Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petsc/private/deviceimpl.h>
16: #include <petscsf.h>
18: /* math2opusutils */
19: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
20: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
21: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
23: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
25: /* Use GPU only if H2OPUS is configured for GPU */
26: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
27: #define PETSC_H2OPUS_USE_GPU
28: #endif
29: #if defined(PETSC_H2OPUS_USE_GPU)
30: #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
31: #else
32: #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
33: #endif
35: // TODO H2OPUS:
36: // DistributedHMatrix
37: // unsymmetric ?
38: // transpose for distributed_hgemv?
39: // clearData()
40: // Unify interface for sequential and parallel?
41: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
42: //
43: template <class T>
44: class PetscPointCloud : public H2OpusDataSet<T> {
45: private:
46: int dimension;
47: size_t num_points;
48: std::vector<T> pts;
50: public:
51: PetscPointCloud(int dim, size_t num_pts, const T coords[])
52: {
53: dim = dim > 0 ? dim : 1;
54: this->dimension = dim;
55: this->num_points = num_pts;
57: pts.resize(num_pts * dim);
58: if (coords) {
59: for (size_t n = 0; n < num_pts; n++)
60: for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
61: } else {
62: PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
63: for (size_t n = 0; n < num_pts; n++) {
64: pts[n * dim] = n * h;
65: for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
66: }
67: }
68: }
70: PetscPointCloud(const PetscPointCloud<T> &other)
71: {
72: size_t N = other.dimension * other.num_points;
73: this->dimension = other.dimension;
74: this->num_points = other.num_points;
75: this->pts.resize(N);
76: for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
77: }
79: int getDimension() const { return dimension; }
81: size_t getDataSetSize() const { return num_points; }
83: T getDataPoint(size_t idx, int dim) const
84: {
85: assert(dim < dimension && idx < num_points);
86: return pts[idx * dimension + dim];
87: }
89: void Print(std::ostream &out = std::cout)
90: {
91: out << "Dimension: " << dimension << std::endl;
92: out << "NumPoints: " << num_points << std::endl;
93: for (size_t n = 0; n < num_points; n++) {
94: for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
95: out << std::endl;
96: }
97: }
98: };
100: template <class T>
101: class PetscFunctionGenerator {
102: private:
103: MatH2OpusKernelFn *k;
104: int dim;
105: void *ctx;
107: public:
108: PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, PetscCtx ctx)
109: {
110: this->k = k;
111: this->dim = dim;
112: this->ctx = ctx;
113: }
114: PetscFunctionGenerator(PetscFunctionGenerator &other)
115: {
116: this->k = other.k;
117: this->dim = other.dim;
118: this->ctx = other.ctx;
119: }
120: T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
121: };
123: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
125: /* just to not clutter the code */
126: #if !defined(H2OPUS_USE_GPU)
127: typedef HMatrix HMatrix_GPU;
128: #if defined(H2OPUS_USE_MPI)
129: typedef DistributedHMatrix DistributedHMatrix_GPU;
130: #endif
131: #endif
/* Private context of a MATH2OPUS matrix */
typedef struct {
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
#else
  h2opusHandle_t handle;
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
#else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF sf;                           /* communicates PETSc-ordered data to/from the H2OPUS ordering */
  PetscLayout h2opus_rmap, h2opus_cmap; /* layouts in the H2OPUS ordering, swapped with A->rmap/cmap in MatH2OpusSetNativeMult */
  IS h2opus_indexmap;                   /* index map extracted from the cluster tree (see MatSetUpMultiply_H2OPUS) */
  thrust::host_vector<PetscScalar> *xx, *yy; /* CPU work buffers for permuted input/output vectors */
  PetscInt xxs, yys;                    /* number of columns the xx/yy buffers can currently hold */
  PetscBool multsetup;                  /* set once MatSetUpMultiply_H2OPUS has completed */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
#else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; /* GPU counterparts of xx/yy */
  PetscInt xxs_gpu, yys_gpu;                           /* GPU counterparts of xxs/yys */
#endif

  /* construction from matvecs */
  PetscMatrixSampler *sampler;
  PetscBool nativemult; /* when true, MatMult works directly in the H2OPUS ordering (no SF permutation) */

  /* Admissibility */
  PetscReal eta;
  PetscInt leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt basisord;             /* basis order for off-diagonal sampling (kernel construction) */
  PetscInt max_rank;             /* maximum rank for construction from matvecs */
  PetscInt bs;                   /* number of concurrent samples when constructing from matvecs */
  PetscReal rtol;                /* relative tolerance for construction from sampling */
  PetscInt norm_max_samples;     /* samples used by MatNorm_H2OPUS */
  PetscBool check_construction;  /* check error after construction from sampling */
  PetscBool hara_verbose;
  PetscBool resize;              /* resize after compression */

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;
/* Destroys all H2OPUS-side objects, work buffers and composed MatProduct callbacks */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  /* release the H2OPUS runtime handle */
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  /* hierarchical matrices are plain C++ objects owned by this context */
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* detach MatProduct implementations composed on this object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
233: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
234: {
235: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
236: PetscBool ish2opus;
238: PetscFunctionBegin;
241: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
242: if (ish2opus) {
243: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245: PetscLayout t;
246: t = A->rmap;
247: A->rmap = a->h2opus_rmap;
248: a->h2opus_rmap = t;
249: t = A->cmap;
250: A->cmap = a->h2opus_cmap;
251: a->h2opus_cmap = t;
252: }
253: }
254: a->nativemult = nm;
255: }
256: PetscFunctionReturn(PETSC_SUCCESS);
257: }
259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
262: PetscBool ish2opus;
264: PetscFunctionBegin;
266: PetscAssertPointer(nm, 2);
267: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
268: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
269: *nm = a->nativemult;
270: PetscFunctionReturn(PETSC_SUCCESS);
271: }
273: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
274: {
275: PetscBool ish2opus;
276: PetscInt nmax = PETSC_DECIDE;
277: Mat_H2OPUS *a = NULL;
278: PetscBool mult = PETSC_FALSE;
280: PetscFunctionBegin;
281: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
282: if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
283: a = (Mat_H2OPUS *)A->data;
285: nmax = a->norm_max_samples;
286: mult = a->nativemult;
287: PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
288: } else {
289: PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
290: }
291: PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
292: if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
293: PetscFunctionReturn(PETSC_SUCCESS);
294: }
296: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
297: {
298: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
299: PetscInt n;
300: PetscBool boundtocpu = PETSC_TRUE;
302: PetscFunctionBegin;
303: #if defined(PETSC_H2OPUS_USE_GPU)
304: boundtocpu = A->boundtocpu;
305: #endif
306: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
307: if (boundtocpu) {
308: if (h2opus->xxs < xN) {
309: h2opus->xx->resize(n * xN);
310: h2opus->xxs = xN;
311: }
312: if (h2opus->yys < yN) {
313: h2opus->yy->resize(n * yN);
314: h2opus->yys = yN;
315: }
316: }
317: #if defined(PETSC_H2OPUS_USE_GPU)
318: if (!boundtocpu) {
319: if (h2opus->xxs_gpu < xN) {
320: h2opus->xx_gpu->resize(n * xN);
321: h2opus->xxs_gpu = xN;
322: }
323: if (h2opus->yys_gpu < yN) {
324: h2opus->yy_gpu->resize(n * yN);
325: h2opus->yys_gpu = yN;
326: }
327: }
328: #endif
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
/*
   Computes C = op(A) * B for dense B and C, where op is A or A^T.

   When the matrix uses a permutation (sf present and native mult disabled),
   B is scattered into the H2OPUS ordering before hgemv and C is gathered
   back afterwards. On GPU builds, the computation runs on the device unless
   the matrix is bound to the CPU; non-CUDA dense operands are converted
   in place for the duration of the call.
*/
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscInt     blda, clda;
  PetscMPIInt  size;
  PetscSF      bsf, csf;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); /* permute operands through the SF? */

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiling so the flop log below reflects only this call */
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B, &blda));
  PetscCall(MatDenseGetLDA(C, &clda));
  if (usesf) {
    PetscInt n;

    /* strided SFs move all columns of the dense operands at once */
    PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
    PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
    /* permuted buffers are dense with leading dimension equal to the local SF size */
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseGetArrayWrite(C, &yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      /* transpose in parallel only supported through symmetry */
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      /* gather the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C, &yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda, biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE; /* allow conversion of the not-yet-assembled result */
      PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
    /* undo the on-the-fly conversions */
    if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
    if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev); /* gops are reported in units of 1e9 operations */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
452: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
453: {
454: Mat_Product *product = C->product;
456: PetscFunctionBegin;
457: MatCheckProduct(C, 1);
458: switch (product->type) {
459: case MATPRODUCT_AB:
460: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
461: break;
462: case MATPRODUCT_AtB:
463: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
464: break;
465: default:
466: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
467: }
468: PetscFunctionReturn(PETSC_SUCCESS);
469: }
471: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
472: {
473: Mat_Product *product = C->product;
474: PetscBool cisdense;
475: Mat A, B;
477: PetscFunctionBegin;
478: MatCheckProduct(C, 1);
479: A = product->A;
480: B = product->B;
481: switch (product->type) {
482: case MATPRODUCT_AB:
483: PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
484: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
485: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
486: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
487: PetscCall(MatSetUp(C));
488: break;
489: case MATPRODUCT_AtB:
490: PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
491: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
492: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
493: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
494: PetscCall(MatSetUp(C));
495: break;
496: default:
497: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
498: }
499: C->ops->productsymbolic = NULL;
500: C->ops->productnumeric = MatProductNumeric_H2OPUS;
501: PetscFunctionReturn(PETSC_SUCCESS);
502: }
504: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
505: {
506: PetscFunctionBegin;
507: MatCheckProduct(C, 1);
508: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
509: PetscFunctionReturn(PETSC_SUCCESS);
510: }
/*
   Core vector kernel: y = s * op(A) x + sy * y, with op A or A^T.

   When a permutation is active (sf present and native mult disabled), x (and y
   when sy != 0, since its old values contribute) are scattered to the H2OPUS
   ordering before hgemv and the result is gathered back. On GPU builds the
   device path is used unless the matrix is bound to the CPU.
*/
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscInt     n;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscMPIInt  size;
  PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); /* permute operands through the SF? */

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiling so the flop log below reflects only this call */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  /* leading dimension: local SF size in the permuted case, local rows otherwise */
  if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y, &yy)); /* old y values are not needed */
    } else {
      PetscCall(VecGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) { /* y contributes to the result, so its values must be permuted too */
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      /* transpose in parallel only supported through symmetry */
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      /* gather the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecRestoreArray(y, &yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDAGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDARestoreArray(y, &yy));
    }
#endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev); /* gops are reported in units of 1e9 operations */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
634: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
635: {
636: PetscBool xiscuda, yiscuda;
638: PetscFunctionBegin;
639: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
640: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
641: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
642: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
643: PetscFunctionReturn(PETSC_SUCCESS);
644: }
646: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
647: {
648: PetscBool xiscuda, yiscuda;
650: PetscFunctionBegin;
651: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
652: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
653: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
654: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
655: PetscFunctionReturn(PETSC_SUCCESS);
656: }
658: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
659: {
660: PetscBool xiscuda, ziscuda;
662: PetscFunctionBegin;
663: PetscCall(VecCopy(y, z));
664: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
665: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
666: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
667: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
668: PetscFunctionReturn(PETSC_SUCCESS);
669: }
671: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
672: {
673: PetscBool xiscuda, ziscuda;
675: PetscFunctionBegin;
676: PetscCall(VecCopy(y, z));
677: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
678: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
679: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
680: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
681: PetscFunctionReturn(PETSC_SUCCESS);
682: }
684: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
685: {
686: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
688: PetscFunctionBegin;
689: a->s *= s;
690: PetscFunctionReturn(PETSC_SUCCESS);
691: }
693: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems PetscOptionsObject)
694: {
695: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
697: PetscFunctionBegin;
698: PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
699: PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
700: PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
701: PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
702: PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
703: PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
704: PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
705: PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
706: PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
707: PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
708: PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
709: PetscOptionsHeadEnd();
710: PetscFunctionReturn(PETSC_SUCCESS);
711: }
713: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);
715: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
716: {
717: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
718: Vec c;
719: PetscInt spacedim;
720: const PetscScalar *coords;
722: PetscFunctionBegin;
723: if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
724: PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
725: if (!c && a->sampler) {
726: Mat S = a->sampler->GetSamplingMat();
728: PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
729: }
730: if (!c) {
731: PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
732: } else {
733: PetscCall(VecGetArrayRead(c, &coords));
734: PetscCall(VecGetBlockSize(c, &spacedim));
735: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
736: PetscCall(VecRestoreArrayRead(c, &coords));
737: }
738: PetscFunctionReturn(PETSC_SUCCESS);
739: }
741: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
742: {
743: MPI_Comm comm;
744: PetscMPIInt size;
745: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
746: PetscInt n = 0, *idx = NULL;
747: int *iidx = NULL;
748: PetscCopyMode own;
749: PetscBool rid;
751: PetscFunctionBegin;
752: if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
753: if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
754: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
755: #if defined(PETSC_H2OPUS_USE_GPU)
756: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
757: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
758: a->xxs_gpu = 1;
759: a->yys_gpu = 1;
760: #endif
761: a->xx = new thrust::host_vector<PetscScalar>(n);
762: a->yy = new thrust::host_vector<PetscScalar>(n);
763: a->xxs = 1;
764: a->yys = 1;
765: } else {
766: IS is;
767: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
768: PetscCallMPI(MPI_Comm_size(comm, &size));
769: if (!a->h2opus_indexmap) {
770: if (size > 1) {
771: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
772: #if defined(H2OPUS_USE_MPI)
773: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
774: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
775: #endif
776: } else {
777: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
778: n = a->hmatrix->u_basis_tree.index_map.size();
779: }
781: if (PetscDefined(USE_64BIT_INDICES)) {
782: PetscInt i;
784: own = PETSC_OWN_POINTER;
785: PetscCall(PetscMalloc1(n, &idx));
786: for (i = 0; i < n; i++) idx[i] = iidx[i];
787: } else {
788: own = PETSC_COPY_VALUES;
789: idx = (PetscInt *)iidx;
790: }
791: PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
792: PetscCall(ISSetPermutation(is));
793: PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
794: a->h2opus_indexmap = is;
795: }
796: PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
797: PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
798: rid = (PetscBool)(n == A->rmap->n);
799: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPI_C_BOOL, MPI_LAND, comm));
800: if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
801: if (!rid) {
802: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
803: PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
804: PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
805: PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
806: PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
807: }
808: PetscCall(PetscSFCreate(comm, &a->sf));
809: PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
810: PetscCall(PetscSFSetUp(a->sf));
811: PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
812: #if defined(PETSC_H2OPUS_USE_GPU)
813: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
814: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
815: a->xxs_gpu = 1;
816: a->yys_gpu = 1;
817: #endif
818: a->xx = new thrust::host_vector<PetscScalar>(n);
819: a->yy = new thrust::host_vector<PetscScalar>(n);
820: a->xxs = 1;
821: a->yys = 1;
822: }
823: PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
824: }
825: a->multsetup = PETSC_TRUE;
826: PetscFunctionReturn(PETSC_SUCCESS);
827: }
/* Assembly hook: (re)builds the hierarchical matrix, either from the user kernel
   (geometric construction) or from matrix-vector sampling of a user operator (hara).
   Any previously built CPU/GPU matrices are discarded and reconstructed from scratch. */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle; /* distributed handle wraps the plain one */
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool   kernel       = PETSC_FALSE; /* set if the kernel (geometric) path was taken */
  PetscBool   boundtocpu   = PETSC_TRUE;
  PetscBool   samplingdone = PETSC_FALSE; /* set if hara sampling construction ran */
  MPI_Comm    comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

  /* XXX */
  /* clamp the leaf size so it never exceeds the (global) matrix size */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* TODO REUSABILITY of geometric construction */
  /* drop any previously assembled matrices; construction below starts from scratch */
  delete a->hmatrix;
  delete a->dist_hmatrix;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
  /* allocate the (distributed or sequential) container for the new matrix */
  if (size > 1) {
#if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
#else
    a->dist_hmatrix = NULL;
#endif
  } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
  if (a->kernel) {
    /* geometric construction: entries generated on the fly from the user kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
#endif
    } else {
      buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* sampling path: only the tree structure is built here; values come from hara below */
    PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  if (!boundtocpu) {
    /* mirror the freshly built CPU matrix on the GPU */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
#endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
#endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* fill the values by adaptive randomized sampling of the user operator (hara),
         with tolerance relative to an estimate of ||A||_2 */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
      verbose = a->hara_verbose;
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
      /* when the H2 ordering differs from PETSc's, the sampler must permute vectors */
      if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
#endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    /* GPU is now the authoritative copy; free the host-side matrices */
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
#endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

  if (!a->s) a->s = 1.0; /* scaling factor defaults to 1 */
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    /* optional a-posteriori accuracy check: compare H2 matrix against the sampled operator */
    PetscBool check  = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
    if (check) {
      Mat               E, Ae;
      PetscReal         n1, ni, n2;
      PetscReal         n1A, niA, n2A;
      PetscErrorCodeFn *normfunc;

      /* E = A - Ae as a MATSHELL, so norms are computed matrix-free */
      Ae = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
      PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E, NORM_1, &n1));
      PetscCall(MatNorm(E, NORM_INFINITY, &ni));
      PetscCall(MatNorm(E, NORM_2, &n2));
      if (checke) {
        /* additionally dump the explicit (dense->AIJ) operators for inspection */
        Mat eA, eE, eAe;

        PetscCall(MatComputeOperator(A, MATAIJ, &eA));
        PetscCall(MatComputeOperator(E, MATAIJ, &eE));
        PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
        PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
        PetscCall(MatView(eA, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
        PetscCall(MatView(eAe, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
        PetscCall(MatView(eE, NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily use the approximate norm on Ae to compute relative errors, then restore */
      PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
      PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae, NORM_1, &n1A));
      PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
      PetscCall(MatNorm(Ae, NORM_2, &n2A));
      n1A = PetscMax(n1A, PETSC_SMALL); /* guard against division by zero below */
      n2A = PetscMax(n2A, PETSC_SMALL);
      niA = PetscMax(niA, PETSC_SMALL);
      PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
      PetscCall(MatDestroy(&E));
    }
    /* release the reference to the sampled operator; it is no longer needed */
    a->sampler->SetSamplingMat(NULL);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
997: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
998: {
999: PetscMPIInt size;
1000: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1002: PetscFunctionBegin;
1003: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1004: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1005: a->hmatrix->clearData();
1006: #if defined(PETSC_H2OPUS_USE_GPU)
1007: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1008: #endif
1009: PetscFunctionReturn(PETSC_SUCCESS);
1010: }
/* Duplicates a MATH2OPUS matrix: copies the construction parameters and point cloud,
   deep-copies the H2 matrices (clearing the data when op == MAT_DO_NOT_COPY_VALUES),
   and shares (by reference) the SF and index map used for vector permutation. */
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat         A;
  Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
  PetscBool   iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
  MPI_Comm    comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  a = (Mat_H2OPUS *)A->data;

  /* copy scalar construction/compression parameters */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s; /* scaling only carried over with the values */

  /* deep copies of the point cloud and (when copying values) the kernel generator */
  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

#if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
#endif
#endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); /* keep structure, drop values */
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
#endif
  /* the SF and index map are immutable here, so share them by reference */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled  = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
#if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
#endif
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu; /* preserve the CPU/GPU binding of the source matrix */
#endif
  PetscCall(MatBindToCPU(A, iscpu));

  *nA = A;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Viewer for MATH2OPUS: MATLAB format dumps the sequential hierarchical matrix,
   the default ASCII format prints construction parameters and memory usage
   (reduced over all ranks in parallel); optionally writes an EPS picture of the
   block structure when -mat_h2opus_vieweps is set. */
static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
{
  Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
  PetscBool         isascii, vieweps;
  PetscMPIInt       size;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscViewerGetFormat(view, &format));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (isascii) {
    if (format == PETSC_VIEWER_ASCII_MATLAB) {
      if (size == 1) { /* dump is only available sequentially */
        FILE *fp;
        PetscCall(PetscViewerASCIIGetPointer(view, &fp));
        dumpHMatrix(*h2opus->hmatrix, 6, fp);
      }
    } else {
      PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
      PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
      PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
      if (!h2opus->kernel) {
        /* sampling-based construction parameters */
        PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
      } else {
        /* kernel-based (geometric) construction parameters */
        PetscCall(PetscViewerASCIIPrintf(view, "  Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
      }
      PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
      if (size == 1) {
        double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
        double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
        double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
#endif
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
#endif
      } else {
        /* parallel: gather local memory usage and reduce the totals over the comm */
#if defined(PETSC_HAVE_CUDA)
        double      matrix_mem[4] = {0., 0., 0., 0.};
        PetscMPIInt rsize         = 4;
#else
        double      matrix_mem[2] = {0., 0.};
        PetscMPIInt rsize         = 2;
#endif
#if defined(H2OPUS_USE_MPI)
        matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
        matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
#if defined(PETSC_HAVE_CUDA)
        matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
        matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
#endif
#endif
        PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
#if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
#endif
      }
    }
  }
  vieweps = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
  if (vieweps) {
    /* default file name is <matname>_structure.eps, overridable via options */
    char        filename[256];
    const char *name;

    PetscCall(PetscObjectGetName((PetscObject)A, &name));
    PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
    PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
    outputEps(*h2opus->hmatrix, filename);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1159: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1160: {
1161: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1162: PetscReal *gcoords;
1163: PetscInt N;
1164: MPI_Comm comm;
1165: PetscMPIInt size;
1166: PetscBool cong;
1168: PetscFunctionBegin;
1169: PetscCall(PetscLayoutSetUp(A->rmap));
1170: PetscCall(PetscLayoutSetUp(A->cmap));
1171: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1172: PetscCall(MatHasCongruentLayouts(A, &cong));
1173: PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1174: N = A->rmap->N;
1175: PetscCallMPI(MPI_Comm_size(comm, &size));
1176: if (spacedim > 0 && size > 1 && cdist) {
1177: PetscSF sf;
1178: MPI_Datatype dtype;
1180: PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1181: PetscCallMPI(MPI_Type_commit(&dtype));
1183: PetscCall(PetscSFCreate(comm, &sf));
1184: PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1185: PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1186: PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1187: PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1188: PetscCall(PetscSFDestroy(&sf));
1189: PetscCallMPI(MPI_Type_free(&dtype));
1190: } else gcoords = (PetscReal *)coords;
1192: delete h2opus->ptcloud;
1193: delete h2opus->kernel;
1194: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1195: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1196: if (gcoords != coords) PetscCall(PetscFree(gcoords));
1197: A->preallocated = PETSC_TRUE;
1198: PetscFunctionReturn(PETSC_SUCCESS);
1199: }
#if defined(PETSC_H2OPUS_USE_GPU)
/* Migrates the hierarchical matrix between host and device.
   flg = PETSC_TRUE binds to the CPU (copy GPU -> CPU, free GPU copy);
   flg = PETSC_FALSE unbinds (copy CPU -> GPU, free CPU copy).
   Also switches the default vector type (VECSTANDARD <-> VECCUDA) accordingly. */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt size;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* data currently lives on the GPU: bring it back to the host */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
#endif
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    /* host copy is now authoritative; release device storage */
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu      = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* data currently lives on the host: push it to the GPU */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
#endif
    } else {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    /* device copy is now authoritative; release host storage */
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  /* keep the default vector type consistent with the binding */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
1255: /*MC
1256: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1258: Options Database Key:
1259: . -mat_type h2opus - matrix type to "h2opus"
1261: Level: beginner
1263: Notes:
1264: H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.
1266: For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
1267: In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1269: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1270: M*/
1271: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1272: {
1273: Mat_H2OPUS *a;
1274: PetscMPIInt size;
1276: PetscFunctionBegin;
1277: #if defined(PETSC_H2OPUS_USE_GPU)
1278: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1279: #endif
1280: PetscCall(PetscNew(&a));
1281: A->data = (void *)a;
1283: a->eta = 0.9;
1284: a->leafsize = 32;
1285: a->basisord = 4;
1286: a->max_rank = 64;
1287: a->bs = 32;
1288: a->rtol = 1.e-4;
1289: a->s = 1.0;
1290: a->norm_max_samples = 10;
1291: a->resize = PETSC_TRUE; /* reallocate after compression */
1292: #if defined(H2OPUS_USE_MPI)
1293: h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1294: #else
1295: h2opusCreateHandle(&a->handle);
1296: #endif
1297: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1298: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1299: PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1301: A->ops->destroy = MatDestroy_H2OPUS;
1302: A->ops->view = MatView_H2OPUS;
1303: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1304: A->ops->mult = MatMult_H2OPUS;
1305: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1306: A->ops->multadd = MatMultAdd_H2OPUS;
1307: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1308: A->ops->scale = MatScale_H2OPUS;
1309: A->ops->duplicate = MatDuplicate_H2OPUS;
1310: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1311: A->ops->norm = MatNorm_H2OPUS;
1312: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1313: #if defined(PETSC_H2OPUS_USE_GPU)
1314: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1315: #endif
1317: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1318: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1319: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1320: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1321: #if defined(PETSC_H2OPUS_USE_GPU)
1322: PetscCall(PetscFree(A->defaultvectype));
1323: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1324: #endif
1325: PetscFunctionReturn(PETSC_SUCCESS);
1326: }
1328: /*@
1329: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1331: Input Parameter:
1332: . A - the matrix
1334: Level: intermediate
1336: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1337: @*/
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  /* no-op for non-MATH2OPUS matrices and for already-orthogonal bases */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
  if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
  HLibProfile::clear(); /* reset H2Opus profiling counters so flops below are this call's only */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  /* NOTE: the if/else branches below straddle preprocessor regions; in CPU-only
     builds only the boundtocpu branch exists and the else disappears entirely */
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU; /* CPU copy modified: it is now the valid one */
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle; /* sequential API needs the plain handle */
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      horthog(*a->hmatrix, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHorthogPerf(gops, time, perf, dev); /* gops reported in GFlop */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1408: /*@
1409: MatH2OpusCompress - Compress a hierarchical matrix.
1411: Input Parameters:
1412: + A - the matrix
1413: - tol - the absolute truncation threshold
1415: Level: intermediate
1417: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1418: @*/
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  /* no-op for non-MATH2OPUS matrices or non-positive tolerances */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatH2OpusOrthogonalize(A)); /* compression requires an orthogonal basis */
  HLibProfile::clear();                 /* reset profiling so logged flops are this call's only */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  /* NOTE: the if/else branches below straddle preprocessor regions; in CPU-only
     builds only the boundtocpu branch exists.  After compressing, when a->resize
     is set, the matrix is copy-constructed to shrink its buffers to the new ranks. */
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      if (a->resize) {
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU; /* CPU copy modified: it is now the valid one */
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
      }
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle; /* sequential API needs the plain handle */
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);

      if (a->resize) {
        HMatrix *hmatrix = new HMatrix(*a->hmatrix);
        delete a->hmatrix;
        a->hmatrix = hmatrix;
      }
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());

      if (a->resize) {
        HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
        delete a->hmatrix_gpu;
        a->hmatrix_gpu = hmatrix_gpu;
      }
#endif
    }
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHcompressPerf(gops, time, perf, dev); /* gops reported in GFlop */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
#else
    PetscCall(PetscLogFlops(1e9 * gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1512: /*@
1513: MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1515: Input Parameters:
1516: + A - the hierarchical matrix
1517: . B - the matrix to be sampled
1518: . bs - maximum number of samples to be taken concurrently
1519: - tol - relative tolerance for construction
1521: Level: intermediate
1523: Notes:
1524: You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1526: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1527: @*/
1528: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1529: {
1530: PetscBool ish2opus;
1532: PetscFunctionBegin;
1538: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1539: if (ish2opus) {
1540: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1542: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1543: a->sampler->SetSamplingMat(B);
1544: if (bs > 0) a->bs = bs;
1545: if (tol > 0.) a->rtol = tol;
1546: delete a->kernel;
1547: }
1548: PetscFunctionReturn(PETSC_SUCCESS);
1549: }
1551: /*@C
1552: MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1554: Input Parameters:
1555: + comm - MPI communicator
1556: . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1557: . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1558: . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1559: . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1560: . spacedim - dimension of the space coordinates
1561: . coords - coordinates of the points
1562: . cdist - whether or not coordinates are distributed
1563: . kernel - computational kernel (or `NULL`)
1564: . kernelctx - kernel context
1565: . eta - admissibility condition tolerance
1566: . leafsize - leaf size in cluster tree
1567: - basisord - approximation order for Chebychev interpolation of low-rank blocks
1569: Output Parameter:
1570: . nA - matrix
1572: Options Database Keys:
1573: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1574: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1575: . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
1576: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1578: Level: intermediate
1580: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1581: @*/
1582: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1583: {
1584: Mat A;
1585: Mat_H2OPUS *h2opus;
1586: PetscBool iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
1588: PetscFunctionBegin;
1589: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1590: PetscCall(MatCreate(comm, &A));
1591: PetscCall(MatSetSizes(A, m, n, M, N));
1592: PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1593: PetscCall(MatSetType(A, MATH2OPUS));
1594: PetscCall(MatBindToCPU(A, iscpu));
1595: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1597: h2opus = (Mat_H2OPUS *)A->data;
1598: if (eta > 0.) h2opus->eta = eta;
1599: if (leafsize > 0) h2opus->leafsize = leafsize;
1600: if (basisord > 0) h2opus->basisord = basisord;
1602: *nA = A;
1603: PetscFunctionReturn(PETSC_SUCCESS);
1604: }
1606: /*@
1607: MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1609: Input Parameters:
1610: + B - the matrix to be sampled
1611: . spacedim - dimension of the space coordinates
1612: . coords - coordinates of the points
1613: . cdist - whether or not coordinates are distributed
1614: . eta - admissibility condition tolerance
1615: . leafsize - leaf size in cluster tree
1616: . maxrank - maximum rank allowed
1617: . bs - maximum number of samples to be taken concurrently
1618: - rtol - relative tolerance for construction
1620: Output Parameter:
1621: . nA - matrix
1623: Options Database Keys:
1624: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1625: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1626: . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1627: . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1628: . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1629: . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
1630: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1631: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be taken when estimating norms
1633: Level: intermediate
1635: Note:
1636: Not available in parallel
1638: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1639: @*/
1640: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1641: {
1642: Mat A;
1643: Mat_H2OPUS *h2opus;
1644: MPI_Comm comm;
1645: PetscBool boundtocpu = PETSC_TRUE;
1647: PetscFunctionBegin;
1656: PetscAssertPointer(nA, 10);
1657: PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1658: PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1659: PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1660: PetscCall(MatCreate(comm, &A));
1661: PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1662: #if defined(PETSC_H2OPUS_USE_GPU)
1663: {
1664: VecType vtype;
1665: PetscBool isstd, iscuda, iskok;
1667: PetscCall(MatGetVecType(B, &vtype));
1668: PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1669: PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1670: PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1671: PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1672: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1673: if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1674: }
1675: #endif
1676: PetscCall(MatSetType(A, MATH2OPUS));
1677: PetscCall(MatBindToCPU(A, boundtocpu));
1678: if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1679: PetscCall(MatPropagateSymmetryOptions(B, A));
1680: /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1682: h2opus = (Mat_H2OPUS *)A->data;
1683: h2opus->sampler = new PetscMatrixSampler(B);
1684: if (eta > 0.) h2opus->eta = eta;
1685: if (leafsize > 0) h2opus->leafsize = leafsize;
1686: if (maxrank > 0) h2opus->max_rank = maxrank;
1687: if (bs > 0) h2opus->bs = bs;
1688: if (rtol > 0.) h2opus->rtol = rtol;
1689: *nA = A;
1690: A->preallocated = PETSC_TRUE;
1691: PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1694: /*@
1695: MatH2OpusGetIndexMap - Access reordering index set.
1697: Input Parameter:
1698: . A - the matrix
1700: Output Parameter:
1701: . indexmap - the index set for the reordering
1703: Level: intermediate
1705: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1706: @*/
1707: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1708: {
1709: PetscBool ish2opus;
1710: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1712: PetscFunctionBegin;
1715: PetscAssertPointer(indexmap, 2);
1716: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1717: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1718: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1719: *indexmap = a->h2opus_indexmap;
1720: PetscFunctionReturn(PETSC_SUCCESS);
1721: }
1723: /*@
1724: MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1726: Input Parameters:
1727: + A - the matrix
1728: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1729: - in - the vector to be mapped
1731: Output Parameter:
1732: . out - the newly created mapped vector
1734: Level: intermediate
1736: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1737: @*/
PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
{
  PetscBool    ish2opus;
  Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
  PetscScalar *xin, *xout;
  PetscBool    nm;

  PetscFunctionBegin;
  PetscAssertPointer(out, 4);
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
  /* Temporarily flip the native-mult flag so MatCreateVecs() sizes *out for the
     TARGET ordering (the opposite of the input's), then restore the user's setting. */
  nm = a->nativemult;
  PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
  PetscCall(MatCreateVecs(A, out, NULL));
  PetscCall(MatH2OpusSetNativeMult(A, nm));
  if (!a->sf) { /* same ordering */
    /* No star forest means no reordering was needed: a plain copy suffices */
    PetscCall(VecCopy(in, *out));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
  PetscCall(VecGetArrayWrite(*out, &xout));
  /* The SF encodes the PETSc<->H2Opus permutation; the two directions use the
     two communication primitives (reduce for native->PETSc, bcast for the reverse) */
  if (nativetopetsc) {
    PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
    PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
  } else {
    PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
  }
  PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
  PetscCall(VecRestoreArrayWrite(*out, &xout));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1776: /*@
1777: MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1779: Input Parameters:
1780: + A - the hierarchical `MATH2OPUS` matrix
1781: . U - the dense low-rank update matrix
1782: . V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1783: - s - the scaling factor
1785: Note:
1786: The `U` and `V` matrices must be in `MATDENSE` dense format
1788: Level: intermediate
1790: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1791: @*/
1792: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1793: {
1794: PetscBool flg;
1796: PetscFunctionBegin;
1799: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1801: PetscCheckSameComm(A, 1, U, 2);
1802: if (V) {
1804: PetscCheckSameComm(A, 1, V, 3);
1805: }
1808: if (!V) V = U;
1809: PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1810: if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1811: PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1812: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1813: PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1814: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1815: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1816: if (flg) {
1817: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1818: const PetscScalar *u, *v, *uu, *vv;
1819: PetscInt ldu, ldv;
1820: PetscMPIInt size;
1821: #if defined(H2OPUS_USE_MPI)
1822: h2opusHandle_t handle = a->handle->handle;
1823: #else
1824: h2opusHandle_t handle = a->handle;
1825: #endif
1826: PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1827: PetscSF usf, vsf;
1829: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1830: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1831: PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1832: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1833: PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1834: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1835: PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1836: PetscCall(MatDenseGetLDA(U, &ldu));
1837: PetscCall(MatDenseGetLDA(V, &ldv));
1838: PetscCall(MatBoundToCPU(A, &flg));
1839: if (usesf) {
1840: PetscInt n;
1842: PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
1843: PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
1844: PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1845: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1846: ldu = n;
1847: ldv = n;
1848: }
1849: if (flg) {
1850: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1851: PetscCall(MatDenseGetArrayRead(U, &u));
1852: PetscCall(MatDenseGetArrayRead(V, &v));
1853: if (usesf) {
1854: vv = MatH2OpusGetThrustPointer(*a->yy);
1855: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1856: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1857: if (U != V) {
1858: uu = MatH2OpusGetThrustPointer(*a->xx);
1859: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1860: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1861: } else uu = vv;
1862: } else {
1863: uu = u;
1864: vv = v;
1865: }
1866: hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1867: PetscCall(MatDenseRestoreArrayRead(U, &u));
1868: PetscCall(MatDenseRestoreArrayRead(V, &v));
1869: } else {
1870: #if defined(PETSC_H2OPUS_USE_GPU)
1871: PetscBool flgU, flgV;
1873: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1874: PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1875: if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1876: PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1877: if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1878: PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1879: PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1880: if (usesf) {
1881: vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1882: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1883: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1884: if (U != V) {
1885: uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1886: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1887: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1888: } else uu = vv;
1889: } else {
1890: uu = u;
1891: vv = v;
1892: }
1893: #else
1894: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1895: #endif
1896: hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1897: #if defined(PETSC_H2OPUS_USE_GPU)
1898: PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1899: PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1900: if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1901: if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1902: #endif
1903: }
1904: PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1905: a->orthogonal = PETSC_FALSE;
1906: }
1907: PetscFunctionReturn(PETSC_SUCCESS);
1908: }
1909: #endif