/* Actual source code: math2opus.cu */

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  4:   #include <h2opus.h>
  5:   #if defined(H2OPUS_USE_MPI)
  6:     #include <h2opus/distributed/distributed_h2opus_handle.h>
  7:     #include <h2opus/distributed/distributed_geometric_construction.h>
  8:     #include <h2opus/distributed/distributed_hgemv.h>
  9:     #include <h2opus/distributed/distributed_horthog.h>
 10:     #include <h2opus/distributed/distributed_hcompress.h>
 11:   #endif
 12:   #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
 20: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
 21: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);

 23:   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 25:   /* Use GPU only if H2OPUS is configured for GPU */
 26:   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 27:     #define PETSC_H2OPUS_USE_GPU
 28:   #endif
 29:   #if defined(PETSC_H2OPUS_USE_GPU)
 30:     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
 31:   #else
 32:     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
 33:   #endif

 35: // TODO H2OPUS:
 36: // DistributedHMatrix
 37: //   unsymmetric ?
 38: //   transpose for distributed_hgemv?
 39: //   clearData()
 40: // Unify interface for sequential and parallel?
 41: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 42: //
 43: template <class T>
 44: class PetscPointCloud : public H2OpusDataSet<T> {
 45: private:
 46:   int            dimension;
 47:   size_t         num_points;
 48:   std::vector<T> pts;

 50: public:
 51:   PetscPointCloud(int dim, size_t num_pts, const T coords[])
 52:   {
 53:     dim              = dim > 0 ? dim : 1;
 54:     this->dimension  = dim;
 55:     this->num_points = num_pts;

 57:     pts.resize(num_pts * dim);
 58:     if (coords) {
 59:       for (size_t n = 0; n < num_pts; n++)
 60:         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
 61:     } else {
 62:       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
 63:       for (size_t n = 0; n < num_pts; n++) {
 64:         pts[n * dim] = n * h;
 65:         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
 66:       }
 67:     }
 68:   }

 70:   PetscPointCloud(const PetscPointCloud<T> &other)
 71:   {
 72:     size_t N         = other.dimension * other.num_points;
 73:     this->dimension  = other.dimension;
 74:     this->num_points = other.num_points;
 75:     this->pts.resize(N);
 76:     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
 77:   }

 79:   int getDimension() const { return dimension; }

 81:   size_t getDataSetSize() const { return num_points; }

 83:   T getDataPoint(size_t idx, int dim) const
 84:   {
 85:     assert(dim < dimension && idx < num_points);
 86:     return pts[idx * dimension + dim];
 87:   }

 89:   void Print(std::ostream &out = std::cout)
 90:   {
 91:     out << "Dimension: " << dimension << std::endl;
 92:     out << "NumPoints: " << num_points << std::endl;
 93:     for (size_t n = 0; n < num_points; n++) {
 94:       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
 95:       out << std::endl;
 96:     }
 97:   }
 98: };

100: template <class T>
101: class PetscFunctionGenerator {
102: private:
103:   MatH2OpusKernelFn *k;
104:   int                dim;
105:   void              *ctx;

107: public:
108:   PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, PetscCtx ctx)
109:   {
110:     this->k   = k;
111:     this->dim = dim;
112:     this->ctx = ctx;
113:   }
114:   PetscFunctionGenerator(PetscFunctionGenerator &other)
115:   {
116:     this->k   = other.k;
117:     this->dim = other.dim;
118:     this->ctx = other.ctx;
119:   }
120:   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
121: };

123: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

125:   /* just to not clutter the code */
126:   #if !defined(H2OPUS_USE_GPU)
127: typedef HMatrix HMatrix_GPU;
128:     #if defined(H2OPUS_USE_MPI)
129: typedef DistributedHMatrix DistributedHMatrix_GPU;
130:     #endif
131:   #endif

/* Private implementation data for MATH2OPUS matrices */
typedef struct {
  /* H2OPUS library handle (distributed flavor when built with MPI) */
  #if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
  #else
  h2opusHandle_t handle;
  #endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
  #if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
  #else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
  #endif
  /* May use permutations */
  PetscSF                           sf;                       /* scatters vectors between PETSc and H2OPUS orderings */
  PetscLayout                       h2opus_rmap, h2opus_cmap; /* layouts in H2OPUS ordering, swapped with A->rmap/cmap for native mult */
  IS                                h2opus_indexmap;          /* index map from the H2OPUS basis tree */
  thrust::host_vector<PetscScalar> *xx, *yy;                  /* host work buffers in H2OPUS ordering */
  PetscInt                          xxs, yys;                 /* number of columns the work buffers are currently sized for */
  PetscBool                         multsetup;                /* true once the mult support data has been built */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
  #if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
  #else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; /* device work buffers, mirror xx/yy */
  PetscInt                            xxs_gpu, yys_gpu;
  #endif

  /* construction from matvecs */
  PetscMatrixSampler *sampler;
  PetscBool           nativemult; /* when true, MatMult operates directly in H2OPUS ordering (no SF scatters) */

  /* Admissibility */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt  basisord;
  PetscInt  max_rank;
  PetscInt  bs;
  PetscReal rtol;
  PetscInt  norm_max_samples;
  PetscBool check_construction;
  PetscBool hara_verbose;
  PetscBool resize;

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;

/* Destroy all data attached to a MATH2OPUS matrix */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  #if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
  #else
  h2opusDestroyHandle(a->handle);
  #endif
  /* C++ objects allocated with new; delete is NULL-safe */
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
  #if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
  #endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* remove composed MatProduct hooks registered at creation */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

233: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
234: {
235:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
236:   PetscBool   ish2opus;

238:   PetscFunctionBegin;
241:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
242:   if (ish2opus) {
243:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245:         PetscLayout t;
246:         t              = A->rmap;
247:         A->rmap        = a->h2opus_rmap;
248:         a->h2opus_rmap = t;
249:         t              = A->cmap;
250:         A->cmap        = a->h2opus_cmap;
251:         a->h2opus_cmap = t;
252:       }
253:     }
254:     a->nativemult = nm;
255:   }
256:   PetscFunctionReturn(PETSC_SUCCESS);
257: }

259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
262:   PetscBool   ish2opus;

264:   PetscFunctionBegin;
266:   PetscAssertPointer(nm, 2);
267:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
268:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
269:   *nm = a->nativemult;
270:   PetscFunctionReturn(PETSC_SUCCESS);
271: }

273: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
274: {
275:   PetscBool   ish2opus;
276:   PetscInt    nmax = PETSC_DECIDE;
277:   Mat_H2OPUS *a    = NULL;
278:   PetscBool   mult = PETSC_FALSE;

280:   PetscFunctionBegin;
281:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
282:   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
283:     a = (Mat_H2OPUS *)A->data;

285:     nmax = a->norm_max_samples;
286:     mult = a->nativemult;
287:     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
288:   } else {
289:     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
290:   }
291:   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
292:   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
293:   PetscFunctionReturn(PETSC_SUCCESS);
294: }

296: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
297: {
298:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
299:   PetscInt    n;
300:   PetscBool   boundtocpu = PETSC_TRUE;

302:   PetscFunctionBegin;
303:   #if defined(PETSC_H2OPUS_USE_GPU)
304:   boundtocpu = A->boundtocpu;
305:   #endif
306:   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
307:   if (boundtocpu) {
308:     if (h2opus->xxs < xN) {
309:       h2opus->xx->resize(n * xN);
310:       h2opus->xxs = xN;
311:     }
312:     if (h2opus->yys < yN) {
313:       h2opus->yy->resize(n * yN);
314:       h2opus->yys = yN;
315:     }
316:   }
317:   #if defined(PETSC_H2OPUS_USE_GPU)
318:   if (!boundtocpu) {
319:     if (h2opus->xxs_gpu < xN) {
320:       h2opus->xx_gpu->resize(n * xN);
321:       h2opus->xxs_gpu = xN;
322:     }
323:     if (h2opus->yys_gpu < yN) {
324:       h2opus->yy_gpu->resize(n * yN);
325:       h2opus->yys_gpu = yN;
326:     }
327:   }
328:   #endif
329:   PetscFunctionReturn(PETSC_SUCCESS);
330: }

/* Computes C = s * op(A) * B, where A is MATH2OPUS, B and C are dense
   matrices and op is the transpose when transA is true. Handles both the
   CPU and (when configured) GPU paths, converting B/C to CUDA dense on the
   fly if needed. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
  #else
  h2opusHandle_t handle = h2opus->handle;
  #endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscInt     blda, clda;
  PetscMPIInt  size;
  PetscSF      bsf, csf;
  /* scatter to the H2OPUS ordering unless the user requested native mult */
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCall(MatDenseGetLDA(B, &blda));
  PetscCall(MatDenseGetLDA(C, &clda));
  if (usesf) {
    PetscInt n;

    /* strided SFs move all columns of B and C in a single scatter */
    PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
    PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseGetArrayWrite(C, &yy));
    if (usesf) {
      /* bring B into the H2OPUS ordering through the work buffers */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
  #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
  #endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      /* scatter the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C, &yy));
  #if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda, biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE;
      PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
    #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
    #endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
    /* restore the original dense types if we converted on the fly */
    if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
    if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
  #endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

452: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
453: {
454:   Mat_Product *product = C->product;

456:   PetscFunctionBegin;
457:   MatCheckProduct(C, 1);
458:   switch (product->type) {
459:   case MATPRODUCT_AB:
460:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
461:     break;
462:   case MATPRODUCT_AtB:
463:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
464:     break;
465:   default:
466:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
467:   }
468:   PetscFunctionReturn(PETSC_SUCCESS);
469: }

471: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
472: {
473:   Mat_Product *product = C->product;
474:   PetscBool    cisdense;
475:   Mat          A, B;

477:   PetscFunctionBegin;
478:   MatCheckProduct(C, 1);
479:   A = product->A;
480:   B = product->B;
481:   switch (product->type) {
482:   case MATPRODUCT_AB:
483:     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
484:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
485:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
486:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
487:     PetscCall(MatSetUp(C));
488:     break;
489:   case MATPRODUCT_AtB:
490:     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
491:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
492:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
493:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
494:     PetscCall(MatSetUp(C));
495:     break;
496:   default:
497:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
498:   }
499:   C->ops->productsymbolic = NULL;
500:   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
501:   PetscFunctionReturn(PETSC_SUCCESS);
502: }

504: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
505: {
506:   PetscFunctionBegin;
507:   MatCheckProduct(C, 1);
508:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
509:   PetscFunctionReturn(PETSC_SUCCESS);
510: }

/* Computes y = s * op(A) * x + sy * y, where A is MATH2OPUS and op is the
   transpose when trans is true. sy == 0.0 selects write-only access to y;
   otherwise y is read-modified. Handles both CPU and GPU paths. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
  #else
  h2opusHandle_t handle = h2opus->handle;
  #endif
  PetscBool    boundtocpu = PETSC_TRUE;
  PetscInt     n;
  PetscScalar *xx, *yy, *uxx, *uyy;
  PetscMPIInt  size;
  /* scatter to the H2OPUS ordering unless the user requested native mult */
  PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  if (usesf) PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
  else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        /* y contributes to the result: bring it to the H2OPUS ordering too */
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
  #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
  #endif
    } else {
      PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      /* scatter the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecRestoreArray(y, &yy));
    }
  #if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDAGetArray(y, &yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
    #if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
    #endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y, &yy));
    } else {
      PetscCall(VecCUDARestoreArray(y, &yy));
    }
  #endif
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHgemvPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

634: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
635: {
636:   PetscBool xiscuda, yiscuda;

638:   PetscFunctionBegin;
639:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
640:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
641:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
642:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
643:   PetscFunctionReturn(PETSC_SUCCESS);
644: }

646: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
647: {
648:   PetscBool xiscuda, yiscuda;

650:   PetscFunctionBegin;
651:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
652:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
653:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
654:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
655:   PetscFunctionReturn(PETSC_SUCCESS);
656: }

658: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
659: {
660:   PetscBool xiscuda, ziscuda;

662:   PetscFunctionBegin;
663:   PetscCall(VecCopy(y, z));
664:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
665:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
666:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
667:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
668:   PetscFunctionReturn(PETSC_SUCCESS);
669: }

671: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
672: {
673:   PetscBool xiscuda, ziscuda;

675:   PetscFunctionBegin;
676:   PetscCall(VecCopy(y, z));
677:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
678:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
679:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
680:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
681:   PetscFunctionReturn(PETSC_SUCCESS);
682: }

684: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
685: {
686:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

688:   PetscFunctionBegin;
689:   a->s *= s;
690:   PetscFunctionReturn(PETSC_SUCCESS);
691: }

693: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems PetscOptionsObject)
694: {
695:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

697:   PetscFunctionBegin;
698:   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
699:   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
700:   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
701:   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
702:   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
703:   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
704:   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
705:   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
706:   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
707:   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
708:   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
709:   PetscOptionsHeadEnd();
710:   PetscFunctionReturn(PETSC_SUCCESS);
711: }

713: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);

715: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
716: {
717:   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
718:   Vec                c;
719:   PetscInt           spacedim;
720:   const PetscScalar *coords;

722:   PetscFunctionBegin;
723:   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
724:   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
725:   if (!c && a->sampler) {
726:     Mat S = a->sampler->GetSamplingMat();

728:     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
729:   }
730:   if (!c) {
731:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
732:   } else {
733:     PetscCall(VecGetArrayRead(c, &coords));
734:     PetscCall(VecGetBlockSize(c, &spacedim));
735:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
736:     PetscCall(VecRestoreArrayRead(c, &coords));
737:   }
738:   PetscFunctionReturn(PETSC_SUCCESS);
739: }

741: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
742: {
743:   MPI_Comm      comm;
744:   PetscMPIInt   size;
745:   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
746:   PetscInt      n = 0, *idx = NULL;
747:   int          *iidx = NULL;
748:   PetscCopyMode own;
749:   PetscBool     rid;

751:   PetscFunctionBegin;
752:   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
753:   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
754:     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
755:   #if defined(PETSC_H2OPUS_USE_GPU)
756:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
757:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
758:     a->xxs_gpu = 1;
759:     a->yys_gpu = 1;
760:   #endif
761:     a->xx  = new thrust::host_vector<PetscScalar>(n);
762:     a->yy  = new thrust::host_vector<PetscScalar>(n);
763:     a->xxs = 1;
764:     a->yys = 1;
765:   } else {
766:     IS is;
767:     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
768:     PetscCallMPI(MPI_Comm_size(comm, &size));
769:     if (!a->h2opus_indexmap) {
770:       if (size > 1) {
771:         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
772:   #if defined(H2OPUS_USE_MPI)
773:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
774:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
775:   #endif
776:       } else {
777:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
778:         n    = a->hmatrix->u_basis_tree.index_map.size();
779:       }

781:       if (PetscDefined(USE_64BIT_INDICES)) {
782:         PetscInt i;

784:         own = PETSC_OWN_POINTER;
785:         PetscCall(PetscMalloc1(n, &idx));
786:         for (i = 0; i < n; i++) idx[i] = iidx[i];
787:       } else {
788:         own = PETSC_COPY_VALUES;
789:         idx = (PetscInt *)iidx;
790:       }
791:       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
792:       PetscCall(ISSetPermutation(is));
793:       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
794:       a->h2opus_indexmap = is;
795:     }
796:     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
797:     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
798:     rid = (PetscBool)(n == A->rmap->n);
799:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPI_C_BOOL, MPI_LAND, comm));
800:     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
801:     if (!rid) {
802:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
803:         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
804:         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
805:         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
806:         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
807:       }
808:       PetscCall(PetscSFCreate(comm, &a->sf));
809:       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
810:       PetscCall(PetscSFSetUp(a->sf));
811:       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
812:   #if defined(PETSC_H2OPUS_USE_GPU)
813:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
814:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
815:       a->xxs_gpu = 1;
816:       a->yys_gpu = 1;
817:   #endif
818:       a->xx  = new thrust::host_vector<PetscScalar>(n);
819:       a->yy  = new thrust::host_vector<PetscScalar>(n);
820:       a->xxs = 1;
821:       a->yys = 1;
822:     }
823:     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
824:   }
825:   a->multsetup = PETSC_TRUE;
826:   PetscFunctionReturn(PETSC_SUCCESS);
827: }

/* Builds (or rebuilds) the hierarchical matrix at assembly time.

   Two construction modes are visible here:
   - kernel-based (a->kernel set): entries are generated geometrically from the
     point cloud, sequentially or distributed;
   - sampling-based (sequential only): only the matrix structure is built here;
     values are filled by the adaptive randomized approximation driver (hara)
     using matrix-vector products with the sampler's matrix.

   Any previously built H2Opus matrices are destroyed and rebuilt from scratch
   (see the REUSABILITY TODO below). On GPU builds, the data ends up on the
   device unless the matrix is bound to the CPU, and the offload mask is set
   accordingly. */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  /* with MPI support, the per-process handle is nested inside the distributed handle */
  #if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle;
  #else
  h2opusHandle_t handle = a->handle;
  #endif
  PetscBool   kernel       = PETSC_FALSE;
  PetscBool   boundtocpu   = PETSC_TRUE;
  PetscBool   samplingdone = PETSC_FALSE;
  MPI_Comm    comm;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

  /* XXX */
  /* clamp the leaf size so it never exceeds the global matrix size */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* TODO REUSABILITY of geometric construction */
  delete a->hmatrix;
  delete a->dist_hmatrix;
  #if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
  #endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
  /* allocate the empty CPU matrix (distributed or sequential) */
  if (size > 1) {
  #if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
  #else
    a->dist_hmatrix = NULL;
  #endif
  } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
  /* derive coordinates from the sampling matrix if they were not set explicitly */
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
  if (a->kernel) {
    /* geometric construction: entries generated from the user kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
  #endif
    } else {
      buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* sampling-based construction: only the structure is built here */
    PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

  #if defined(PETSC_H2OPUS_USE_GPU)
  /* mirror the CPU matrix on the device unless the matrix is bound to the CPU */
  boundtocpu = A->boundtocpu;
  if (!boundtocpu) {
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
    #if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
    #endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
  #endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
      verbose = a->hara_verbose;
      /* estimate ||A||_2 to turn the relative tolerance into an absolute one */
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
      if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
      a->sampler->SetStream(handle->getMainStream());
      /* note: the `} else {` below lives inside the GPU guard, so the GPU sampling
         path only exists in GPU-enabled builds */
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
  #if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
  #endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
  #if defined(PETSC_H2OPUS_USE_GPU)
  /* when the data lives on the GPU, drop the CPU copies and record the offload state */
  if (!boundtocpu) {
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  #endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

  if (!a->s) a->s = 1.0;
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    /* optional construction check: compare the compressed operator against the sampled one */
    PetscBool check  = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
    if (check) {
      Mat               E, Ae;
      PetscReal         n1, ni, n2;
      PetscReal         n1A, niA, n2A;
      PetscErrorCodeFn *normfunc;

      /* E = A - Ae as a shell matrix; norms are computed approximately via MatNorm_H2OPUS */
      Ae = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
      PetscCall(MatShellSetOperation(E, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E, NORM_1, &n1));
      PetscCall(MatNorm(E, NORM_INFINITY, &ni));
      PetscCall(MatNorm(E, NORM_2, &n2));
      if (checke) {
        /* explicitly assemble and print the operators for visual inspection */
        Mat eA, eE, eAe;

        PetscCall(MatComputeOperator(A, MATAIJ, &eA));
        PetscCall(MatComputeOperator(E, MATAIJ, &eE));
        PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
        PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
        PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
        PetscCall(MatView(eA, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
        PetscCall(MatView(eAe, NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
        PetscCall(MatView(eE, NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily swap in the approximate norm for Ae, then restore it */
      PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
      PetscCall(MatSetOperation(Ae, MATOP_NORM, (PetscErrorCodeFn *)MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae, NORM_1, &n1A));
      PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
      PetscCall(MatNorm(Ae, NORM_2, &n2A));
      n1A = PetscMax(n1A, PETSC_SMALL);
      n2A = PetscMax(n2A, PETSC_SMALL);
      niA = PetscMax(niA, PETSC_SMALL);
      PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
      PetscCall(MatDestroy(&E));
    }
    /* drop the reference to the sampled matrix now that construction is done */
    a->sampler->SetSamplingMat(NULL);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

997: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
998: {
999:   PetscMPIInt size;
1000:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1002:   PetscFunctionBegin;
1003:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1004:   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1005:   a->hmatrix->clearData();
1006:   #if defined(PETSC_H2OPUS_USE_GPU)
1007:   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1008:   #endif
1009:   PetscFunctionReturn(PETSC_SUCCESS);
1010: }

/* Duplicates a MATH2OPUS matrix.

   Scalar construction parameters and the point cloud are always copied.
   The hierarchical matrix data is deep-copied (and cleared again when
   op == MAT_DO_NOT_COPY_VALUES), while the scatter SF and the index-map IS
   are shared by reference counting (see the matching comment in
   MatSetUpMultiply_H2OPUS). The scaling factor and the kernel are only
   propagated when values are copied. */
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat         A;
  Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
  PetscBool   iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;
  MPI_Comm    comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
  PetscCall(MatSetType(A, MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B, A));
  a = (Mat_H2OPUS *)A->data;

  /* copy construction/compression parameters */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s;

  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

  /* deep-copy whichever H2Opus matrices exist on B */
  #if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
    #if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
    #endif
  #endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
  }
  #if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
  #endif
  /* the SF and the index map are shared, not copied */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled  = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
  #if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
  #endif
  }
  /* match B's CPU/GPU binding (always CPU in non-GPU builds) */
  #if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu;
  #endif
  PetscCall(MatBindToCPU(A, iscpu));

  *nA = A;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Viewer for MATH2OPUS.

   ASCII MATLAB format (sequential only): dumps the hierarchical matrix via
   dumpHMatrix to the viewer's file pointer. Other ASCII formats print a
   summary: construction mode, point cloud dimension, admissibility and
   sampling parameters, and dense/low-rank memory usage (summed across ranks
   in parallel). Optionally writes an EPS picture of the matrix structure
   when -mat_h2opus_vieweps is set. */
static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
{
  Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
  PetscBool         isascii, vieweps;
  PetscMPIInt       size;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscViewerGetFormat(view, &format));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (isascii) {
    if (format == PETSC_VIEWER_ASCII_MATLAB) {
      if (size == 1) {
        FILE *fp;
        PetscCall(PetscViewerASCIIGetPointer(view, &fp));
        dumpHMatrix(*h2opus->hmatrix, 6, fp);
      }
    } else {
      PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
      PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
      PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
      if (!h2opus->kernel) {
        PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
      } else {
        PetscCall(PetscViewerASCIIPrintf(view, "  Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
      }
      PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
      if (size == 1) {
        /* sequential: query the matrices directly (0 when a copy does not exist) */
        double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
        double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
  #if defined(PETSC_HAVE_CUDA)
        double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
        double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
  #endif
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
  #if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
  #endif
      } else {
        /* parallel: gather local usage in matrix_mem and sum across ranks;
           layout is [cpu dense, cpu low rank] plus [gpu dense, gpu low rank] in CUDA builds */
  #if defined(PETSC_HAVE_CUDA)
        double      matrix_mem[4] = {0., 0., 0., 0.};
        PetscMPIInt rsize         = 4;
  #else
        double      matrix_mem[2] = {0., 0.};
        PetscMPIInt rsize         = 2;
  #endif
  #if defined(H2OPUS_USE_MPI)
        matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
        matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
    #if defined(PETSC_HAVE_CUDA)
        matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
        matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
    #endif
  #endif
        PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
  #if defined(PETSC_HAVE_CUDA)
        PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
  #endif
      }
    }
  }
  vieweps = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
  if (vieweps) {
    char        filename[256];
    const char *name;

    /* NOTE(review): this path dereferences h2opus->hmatrix unconditionally — it
       appears to assume a sequential, CPU-resident matrix; confirm for parallel/GPU runs */
    PetscCall(PetscObjectGetName((PetscObject)A, &name));
    PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
    PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
    outputEps(*h2opus->hmatrix, filename);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1159: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1160: {
1161:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1162:   PetscReal  *gcoords;
1163:   PetscInt    N;
1164:   MPI_Comm    comm;
1165:   PetscMPIInt size;
1166:   PetscBool   cong;

1168:   PetscFunctionBegin;
1169:   PetscCall(PetscLayoutSetUp(A->rmap));
1170:   PetscCall(PetscLayoutSetUp(A->cmap));
1171:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1172:   PetscCall(MatHasCongruentLayouts(A, &cong));
1173:   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1174:   N = A->rmap->N;
1175:   PetscCallMPI(MPI_Comm_size(comm, &size));
1176:   if (spacedim > 0 && size > 1 && cdist) {
1177:     PetscSF      sf;
1178:     MPI_Datatype dtype;

1180:     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1181:     PetscCallMPI(MPI_Type_commit(&dtype));

1183:     PetscCall(PetscSFCreate(comm, &sf));
1184:     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1185:     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1186:     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1187:     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1188:     PetscCall(PetscSFDestroy(&sf));
1189:     PetscCallMPI(MPI_Type_free(&dtype));
1190:   } else gcoords = (PetscReal *)coords;

1192:   delete h2opus->ptcloud;
1193:   delete h2opus->kernel;
1194:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1195:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1196:   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1197:   A->preallocated = PETSC_TRUE;
1198:   PetscFunctionReturn(PETSC_SUCCESS);
1199: }

1201:   #if defined(PETSC_H2OPUS_USE_GPU)
/* Binds the matrix to the CPU (flg == PETSC_TRUE) or allows GPU execution.

   Migrates the hierarchical matrix data in the direction implied by flg and the
   current offload mask, deletes the copy on the side being abandoned, updates
   the offload mask, and switches the default vector type (VECSTANDARD vs
   VECCUDA) to match. A no-op data-wise when the data is already on the
   requested side. */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt size;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* GPU -> CPU: copy-construct or assign into the host matrix, then free device data */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
    #if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
    #endif
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu      = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask      = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* CPU -> GPU: copy-construct or assign into the device matrix, then free host data */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
    #if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
    #endif
    } else {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix      = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  /* keep the default vector type consistent with the binding */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1253:   #endif

1255: /*MC
1256:    MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.

1258:    Options Database Key:
.  -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1261:    Level: beginner

1263:    Notes:
1264:    H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.

1266:    For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
1267:    In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.

1269: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1270: M*/
/* Type constructor for MATH2OPUS: allocates the per-matrix data structure with
   default construction/compression parameters, creates the H2Opus handle
   (distributed when MPI support is compiled in), installs the operations
   table, and composes the Mat-Mat product hooks for dense right-hand sides.
   On GPU builds it also initializes the CUDA device and sets VECCUDA as the
   default vector type. */
PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
{
  Mat_H2OPUS *a;
  PetscMPIInt size;

  PetscFunctionBegin;
  #if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  #endif
  PetscCall(PetscNew(&a));
  A->data = (void *)a;

  /* default admissibility, clustering, and sampling parameters */
  a->eta              = 0.9;
  a->leafsize         = 32;
  a->basisord         = 4;
  a->max_rank         = 64;
  a->bs               = 32;
  a->rtol             = 1.e-4;
  a->s                = 1.0;
  a->norm_max_samples = 10;
  a->resize           = PETSC_TRUE; /* reallocate after compression */
  #if defined(H2OPUS_USE_MPI)
  h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
  #else
  h2opusCreateHandle(&a->handle);
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
  PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

  A->ops->destroy          = MatDestroy_H2OPUS;
  A->ops->view             = MatView_H2OPUS;
  A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
  A->ops->mult             = MatMult_H2OPUS;
  A->ops->multtranspose    = MatMultTranspose_H2OPUS;
  A->ops->multadd          = MatMultAdd_H2OPUS;
  A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
  A->ops->scale            = MatScale_H2OPUS;
  A->ops->duplicate        = MatDuplicate_H2OPUS;
  A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
  A->ops->norm             = MatNorm_H2OPUS;
  A->ops->zeroentries      = MatZeroEntries_H2OPUS;
  #if defined(PETSC_H2OPUS_USE_GPU)
  A->ops->bindtocpu = MatBindToCPU_H2OPUS;
  #endif

  /* MatProduct support with (cuda) dense matrices, sequential and parallel */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
  #if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
  #endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

1328: /*@
1329:   MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1331:   Input Parameter:
1332: . A - the matrix

1334:   Level: intermediate

1336: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1337: @*/
/* Orthogonalizes the basis tree of the hierarchical matrix (no-op for non-H2OPUS
   matrices or when already orthogonal). Dispatches to the distributed or
   sequential horthog driver on whichever side (CPU/GPU) currently holds the
   data, and logs the flops reported by HLibProfile. Note: the `} else {` of each
   CPU/GPU branch below lives inside a PETSC_H2OPUS_USE_GPU guard, so the GPU
   path only exists in GPU-enabled builds. */
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
  if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
    #if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
    #endif
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  } else {
    /* sequential: unwrap the per-process handle from the distributed one if needed */
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      horthog(*a->hmatrix, handle);
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHorthogPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1408: /*@
1409:   MatH2OpusCompress - Compress a hierarchical matrix.

1411:   Input Parameters:
1412: + A   - the matrix
1413: - tol - the absolute truncation threshold

1415:   Level: intermediate

1417: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1418: @*/
/* Compresses the hierarchical matrix to the absolute threshold tol (no-op for
   non-H2OPUS matrices or non-positive tol). Orthogonalizes first — hcompress
   requires an orthogonal basis — then runs the (distributed) hcompress driver
   on whichever side (CPU/GPU) holds the data. When a->resize is set, the
   matrix is copy-constructed afresh afterwards so the storage shrinks to the
   compressed size. Flops reported by HLibProfile are logged. As in
   MatH2OpusOrthogonalize, each `} else {` below sits inside a
   PETSC_H2OPUS_USE_GPU guard. */
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool   ish2opus;
  Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
  PetscMPIInt size;
  PetscBool   boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatH2OpusOrthogonalize(A));
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
  #if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  #endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
  #if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      if (a->resize) {
        /* copy-construct to shrink allocations to the compressed size */
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
  #endif
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
    #if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
      }
    #endif
      PetscCall(PetscLogGpuTimeEnd());
  #endif
    }
  } else {
    /* sequential: unwrap the per-process handle from the distributed one if needed */
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);

      if (a->resize) {
        HMatrix *hmatrix = new HMatrix(*a->hmatrix);
        delete a->hmatrix;
        a->hmatrix = hmatrix;
      }
  #if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());

      if (a->resize) {
        HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
        delete a->hmatrix_gpu;
        a->hmatrix_gpu = hmatrix_gpu;
      }
  #endif
    }
  }
  { /* log flops */
    double gops, time, perf, dev;
    HLibProfile::getHcompressPerf(gops, time, perf, dev);
  #if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) PetscCall(PetscLogFlops(1e9 * gops));
    else PetscCall(PetscLogGpuFlops(1e9 * gops));
  #else
    PetscCall(PetscLogFlops(1e9 * gops));
  #endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1512: /*@
1513:   MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.

1515:   Input Parameters:
1516: + A   - the hierarchical matrix
1517: . B   - the matrix to be sampled
1518: . bs  - maximum number of samples to be taken concurrently
1519: - tol - relative tolerance for construction

1521:   Level: intermediate

1523:   Notes:
1524:   You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.

1526: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1527: @*/
1528: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1529: {
1530:   PetscBool ish2opus;

1532:   PetscFunctionBegin;
1538:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1539:   if (ish2opus) {
1540:     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1542:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1543:     a->sampler->SetSamplingMat(B);
1544:     if (bs > 0) a->bs = bs;
1545:     if (tol > 0.) a->rtol = tol;
1546:     delete a->kernel;
1547:   }
1548:   PetscFunctionReturn(PETSC_SUCCESS);
1549: }

1551: /*@C
1552:   MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.

1554:   Input Parameters:
1555: + comm      - MPI communicator
1556: . m         - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1557: . n         - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1558: . M         - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1559: . N         - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1560: . spacedim  - dimension of the space coordinates
1561: . coords    - coordinates of the points
1562: . cdist     - whether or not coordinates are distributed
1563: . kernel    - computational kernel (or `NULL`)
1564: . kernelctx - kernel context
1565: . eta       - admissibility condition tolerance
1566: . leafsize  - leaf size in cluster tree
1567: - basisord  - approximation order for Chebychev interpolation of low-rank blocks

1569:   Output Parameter:
1570: . nA - matrix

1572:   Options Database Keys:
1573: + -mat_h2opus_leafsize <`PetscInt`>    - Leaf size of cluster tree
1574: . -mat_h2opus_eta <`PetscReal`>        - Admissibility condition tolerance
1575: . -mat_h2opus_order <`PetscInt`>       - Chebychev approximation order
1576: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

1578:   Level: intermediate

1580: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1581: @*/
1582: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1583: {
1584:   Mat         A;
1585:   Mat_H2OPUS *h2opus;
1586:   PetscBool   iscpu = PetscDefined(H2OPUS_USE_GPU) ? PETSC_FALSE : PETSC_TRUE;

1588:   PetscFunctionBegin;
1589:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1590:   PetscCall(MatCreate(comm, &A));
1591:   PetscCall(MatSetSizes(A, m, n, M, N));
1592:   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1593:   PetscCall(MatSetType(A, MATH2OPUS));
1594:   PetscCall(MatBindToCPU(A, iscpu));
1595:   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));

1597:   h2opus = (Mat_H2OPUS *)A->data;
1598:   if (eta > 0.) h2opus->eta = eta;
1599:   if (leafsize > 0) h2opus->leafsize = leafsize;
1600:   if (basisord > 0) h2opus->basisord = basisord;

1602:   *nA = A;
1603:   PetscFunctionReturn(PETSC_SUCCESS);
1604: }

1606: /*@
1607:   MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.

1609:   Input Parameters:
1610: + B        - the matrix to be sampled
1611: . spacedim - dimension of the space coordinates
1612: . coords   - coordinates of the points
1613: . cdist    - whether or not coordinates are distributed
1614: . eta      - admissibility condition tolerance
1615: . leafsize - leaf size in cluster tree
1616: . maxrank  - maximum rank allowed
1617: . bs       - maximum number of samples to be taken concurrently
1618: - rtol     - relative tolerance for construction

1620:   Output Parameter:
1621: . nA - matrix

1623:   Options Database Keys:
1624: + -mat_h2opus_leafsize <`PetscInt`>      - Leaf size of cluster tree
1625: . -mat_h2opus_eta <`PetscReal`>          - Admissibility condition tolerance
1626: . -mat_h2opus_maxrank <`PetscInt`>       - Maximum rank when constructed from matvecs
1627: . -mat_h2opus_samples <`PetscInt`>       - Maximum number of samples to be taken concurrently when constructing from matvecs
1628: . -mat_h2opus_rtol <`PetscReal`>         - Relative tolerance for construction from sampling
1629: . -mat_h2opus_check <`PetscBool`>        - Check error when constructing from sampling during MatAssemblyEnd()
1630: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
- -mat_h2opus_normsamples <`PetscInt`>   - Maximum number of samples to be used when estimating norms

1633:   Level: intermediate

1635:   Note:
1636:   Not available in parallel

1638: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1639: @*/
1640: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1641: {
1642:   Mat         A;
1643:   Mat_H2OPUS *h2opus;
1644:   MPI_Comm    comm;
1645:   PetscBool   boundtocpu = PETSC_TRUE;

1647:   PetscFunctionBegin;
1656:   PetscAssertPointer(nA, 10);
1657:   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1658:   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1659:   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1660:   PetscCall(MatCreate(comm, &A));
1661:   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1662:   #if defined(PETSC_H2OPUS_USE_GPU)
1663:   {
1664:     VecType   vtype;
1665:     PetscBool isstd, iscuda, iskok;

1667:     PetscCall(MatGetVecType(B, &vtype));
1668:     PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1669:     PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1670:     PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1671:     PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1672:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1673:     if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1674:   }
1675:   #endif
1676:   PetscCall(MatSetType(A, MATH2OPUS));
1677:   PetscCall(MatBindToCPU(A, boundtocpu));
1678:   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1679:   PetscCall(MatPropagateSymmetryOptions(B, A));
1680:   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1682:   h2opus          = (Mat_H2OPUS *)A->data;
1683:   h2opus->sampler = new PetscMatrixSampler(B);
1684:   if (eta > 0.) h2opus->eta = eta;
1685:   if (leafsize > 0) h2opus->leafsize = leafsize;
1686:   if (maxrank > 0) h2opus->max_rank = maxrank;
1687:   if (bs > 0) h2opus->bs = bs;
1688:   if (rtol > 0.) h2opus->rtol = rtol;
1689:   *nA             = A;
1690:   A->preallocated = PETSC_TRUE;
1691:   PetscFunctionReturn(PETSC_SUCCESS);
1692: }

1694: /*@
1695:   MatH2OpusGetIndexMap - Access reordering index set.

1697:   Input Parameter:
1698: . A - the matrix

1700:   Output Parameter:
1701: . indexmap - the index set for the reordering

1703:   Level: intermediate

1705: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1706: @*/
1707: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1708: {
1709:   PetscBool   ish2opus;
1710:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1712:   PetscFunctionBegin;
1715:   PetscAssertPointer(indexmap, 2);
1716:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1717:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1718:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1719:   *indexmap = a->h2opus_indexmap;
1720:   PetscFunctionReturn(PETSC_SUCCESS);
1721: }

1723: /*@
1724:   MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

1726:   Input Parameters:
1727: + A             - the matrix
1728: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1729: - in            - the vector to be mapped

1731:   Output Parameter:
1732: . out - the newly created mapped vector

1734:   Level: intermediate

1736: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1737: @*/
1738: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1739: {
1740:   PetscBool    ish2opus;
1741:   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1742:   PetscScalar *xin, *xout;
1743:   PetscBool    nm;

1745:   PetscFunctionBegin;
1750:   PetscAssertPointer(out, 4);
1751:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1752:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1753:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1754:   nm = a->nativemult;
1755:   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1756:   PetscCall(MatCreateVecs(A, out, NULL));
1757:   PetscCall(MatH2OpusSetNativeMult(A, nm));
1758:   if (!a->sf) { /* same ordering */
1759:     PetscCall(VecCopy(in, *out));
1760:     PetscFunctionReturn(PETSC_SUCCESS);
1761:   }
1762:   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1763:   PetscCall(VecGetArrayWrite(*out, &xout));
1764:   if (nativetopetsc) {
1765:     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1766:     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1767:   } else {
1768:     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1769:     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1770:   }
1771:   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1772:   PetscCall(VecRestoreArrayWrite(*out, &xout));
1773:   PetscFunctionReturn(PETSC_SUCCESS);
1774: }

1776: /*@
1777:   MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $

1779:   Input Parameters:
1780: + A - the hierarchical `MATH2OPUS` matrix
1781: . s - the scaling factor
1782: . U - the dense low-rank update matrix
1783: - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)

1785:   Note:
1786:   The `U` and `V` matrices must be in `MATDENSE` dense format

1788:   Level: intermediate

1790: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1791: @*/
PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
{
  PetscBool flg;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
  PetscCheckSameComm(A, 1, U, 2);
  if (V) {
    PetscCheckSameComm(A, 1, V, 3);
  }

  /* V = NULL means a symmetric update A += s*U*U^T */
  if (!V) V = U;
  PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
  /* a rank-0 update is a no-op */
  if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
  PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
  if (flg) {
    Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
    const PetscScalar *u, *v, *uu, *vv;
    PetscInt           ldu, ldv;
    PetscMPIInt        size;
  #if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
  #else
    h2opusHandle_t handle = a->handle;
  #endif
    /* usesf: inputs are in PETSc ordering and must be permuted through a->sf
       into H2Opus ordering before the update */
    PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
    PetscSF   usf, vsf;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
    PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
    PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
    PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
    PetscCall(MatDenseGetLDA(U, &ldu));
    PetscCall(MatDenseGetLDA(V, &ldv));
    /* flg now holds whether A is bound to the CPU; it selects the CPU or GPU path below */
    PetscCall(MatBoundToCPU(A, &flg));
    if (usesf) {
      PetscInt n;

      /* build strided SFs matching the dense layouts of U and V, and make sure
         the permutation buffers can hold all columns */
      PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
      PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
      PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
      PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
      /* the permuted copies are packed with leading dimension n */
      ldu = n;
      ldv = n;
    }
    if (flg) {
      /* CPU path */
      PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
      PetscCall(MatDenseGetArrayRead(U, &u));
      PetscCall(MatDenseGetArrayRead(V, &v));
      if (usesf) {
        /* permute V (and U when distinct) into H2Opus ordering via the work buffers */
        vv = MatH2OpusGetThrustPointer(*a->yy);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
      /* perform the global low-rank update A += s * uu * vv^T */
      hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
      PetscCall(MatDenseRestoreArrayRead(U, &u));
      PetscCall(MatDenseRestoreArrayRead(V, &v));
    } else {
  #if defined(PETSC_H2OPUS_USE_GPU)
      /* GPU path: convert host-dense inputs to CUDA dense in place, restore afterwards */
      PetscBool flgU, flgV;

      PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
      PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
      PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
      if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
      PetscCall(MatDenseCUDAGetArrayRead(U, &u));
      PetscCall(MatDenseCUDAGetArrayRead(V, &v));
      if (usesf) {
        /* same permutation as the CPU path, using the device work buffers */
        vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
        PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
          PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
        } else uu = vv;
      } else {
        uu = u;
        vv = v;
      }
  #else
      /* MatBoundToCPU() cannot report false without GPU support */
      SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
  #endif
      hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
  #if defined(PETSC_H2OPUS_USE_GPU)
      PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
      PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
      /* undo the in-place conversions so the caller gets U and V back in their original type */
      if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
      if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
  #endif
    }
    PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
    /* the update invalidates the orthogonal basis; recompression must re-orthogonalize */
    a->orthogonal = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1909: #endif