Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU-only while PETSc has GPU support */
  4:   #include <h2opus.h>
  5:   #if defined(H2OPUS_USE_MPI)
  6:     #include <h2opus/distributed/distributed_h2opus_handle.h>
  7:     #include <h2opus/distributed/distributed_geometric_construction.h>
  8:     #include <h2opus/distributed/distributed_hgemv.h>
  9:     #include <h2opus/distributed/distributed_horthog.h>
 10:     #include <h2opus/distributed/distributed_hcompress.h>
 11:   #endif
 12:   #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
 20: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
 21: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
 22: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);

 24:   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 26:   /* Use GPU only if H2OPUS is configured for GPU */
 27:   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 28:     #define PETSC_H2OPUS_USE_GPU
 29:   #endif
 30:   #if defined(PETSC_H2OPUS_USE_GPU)
 31:     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
 32:   #else
 33:     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
 34:   #endif

 36: // TODO H2OPUS:
 37: // DistributedHMatrix
 38: //   unsymmetric ?
 39: //   transpose for distributed_hgemv?
 40: //   clearData()
 41: // Unify interface for sequential and parallel?
 42: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 43: //
 44: template <class T>
 45: class PetscPointCloud : public H2OpusDataSet<T> {
 46: private:
 47:   int            dimension;
 48:   size_t         num_points;
 49:   std::vector<T> pts;

 51: public:
 52:   PetscPointCloud(int dim, size_t num_pts, const T coords[])
 53:   {
 54:     dim              = dim > 0 ? dim : 1;
 55:     this->dimension  = dim;
 56:     this->num_points = num_pts;

 58:     pts.resize(num_pts * dim);
 59:     if (coords) {
 60:       for (size_t n = 0; n < num_pts; n++)
 61:         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
 62:     } else {
 63:       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
 64:       for (size_t n = 0; n < num_pts; n++) {
 65:         pts[n * dim] = n * h;
 66:         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
 67:       }
 68:     }
 69:   }

 71:   PetscPointCloud(const PetscPointCloud<T> &other)
 72:   {
 73:     size_t N         = other.dimension * other.num_points;
 74:     this->dimension  = other.dimension;
 75:     this->num_points = other.num_points;
 76:     this->pts.resize(N);
 77:     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
 78:   }

 80:   int getDimension() const { return dimension; }

 82:   size_t getDataSetSize() const { return num_points; }

 84:   T getDataPoint(size_t idx, int dim) const
 85:   {
 86:     assert(dim < dimension && idx < num_points);
 87:     return pts[idx * dimension + dim];
 88:   }

 90:   void Print(std::ostream &out = std::cout)
 91:   {
 92:     out << "Dimension: " << dimension << std::endl;
 93:     out << "NumPoints: " << num_points << std::endl;
 94:     for (size_t n = 0; n < num_points; n++) {
 95:       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
 96:       out << std::endl;
 97:     }
 98:   }
 99: };
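
/*
  Layout note (illustrative, not part of the original source): coordinates are stored
  interleaved, with pts[n*dim + i] holding the i-th coordinate of point n. A hypothetical
  2D cloud with points (0,0), (1,0) and (0,1) would therefore be passed as
  coords[] = {0., 0., 1., 0., 0., 1.} with dim = 2 and num_pts = 3; passing coords = NULL
  falls back to the default 1D cloud that places point n at (n, 0, ..., 0).
*/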

101: template <class T>
102: class PetscFunctionGenerator {
103: private:
104:   MatH2OpusKernelFn *k;
105:   int                dim;
106:   void              *ctx;

108: public:
109:   PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, void *ctx)
110:   {
111:     this->k   = k;
112:     this->dim = dim;
113:     this->ctx = ctx;
114:   }
115:   PetscFunctionGenerator(PetscFunctionGenerator &other)
116:   {
117:     this->k   = other.k;
118:     this->dim = other.dim;
119:     this->ctx = other.ctx;
120:   }
121:   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
122: };
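
/*
  Illustrative sketch (not part of the original source): a user kernel with the calling
  convention PetscFunctionGenerator::operator() assumes above, i.e. it is invoked as
  k(dim, pt1, pt2, ctx) and its return value is cast to the scalar type. GaussianKernel
  and GaussCtx below are hypothetical names.

    typedef struct {
      PetscReal sigma;
    } GaussCtx;

    static PetscScalar GaussianKernel(PetscInt dim, PetscReal x[], PetscReal y[], void *ctx)
    {
      GaussCtx *g  = (GaussCtx *)ctx;
      PetscReal r2 = 0.0;

      for (PetscInt d = 0; d < dim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
      return (PetscScalar)PetscExpReal(-r2 / (2.0 * g->sigma * g->sigma));
    }

  A kernel of this form is what ends up wrapped by the generator: during construction it is
  simply evaluated for every (pt1, pt2) pair requested.
*/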

124: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

126:   /* just to not clutter the code */
127:   #if !defined(H2OPUS_USE_GPU)
128: typedef HMatrix HMatrix_GPU;
129:     #if defined(H2OPUS_USE_MPI)
130: typedef DistributedHMatrix DistributedHMatrix_GPU;
131:     #endif
132:   #endif

134: typedef struct {
135:   #if defined(H2OPUS_USE_MPI)
136:   distributedH2OpusHandle_t handle;
137:   #else
138:   h2opusHandle_t handle;
139:   #endif
140:   /* Sequential and parallel matrices are two different classes at the moment */
141:   HMatrix *hmatrix;
142:   #if defined(H2OPUS_USE_MPI)
143:   DistributedHMatrix *dist_hmatrix;
144:   #else
145:   HMatrix *dist_hmatrix; /* just to not clutter the code */
146:   #endif
147:   /* May use permutations */
148:   PetscSF                           sf;
149:   PetscLayout                       h2opus_rmap, h2opus_cmap;
150:   IS                                h2opus_indexmap;
151:   thrust::host_vector<PetscScalar> *xx, *yy;
152:   PetscInt                          xxs, yys;
153:   PetscBool                         multsetup;

155:   /* GPU */
156:   HMatrix_GPU *hmatrix_gpu;
157:   #if defined(H2OPUS_USE_MPI)
158:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
159:   #else
160:   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
161:   #endif
162:   #if defined(PETSC_H2OPUS_USE_GPU)
163:   thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
164:   PetscInt                            xxs_gpu, yys_gpu;
165:   #endif

167:   /* construction from matvecs */
168:   PetscMatrixSampler *sampler;
169:   PetscBool           nativemult;

171:   /* Admissibility */
172:   PetscReal eta;
173:   PetscInt  leafsize;

175:   /* for dof reordering */
176:   PetscPointCloud<PetscReal> *ptcloud;

178:   /* kernel for generating matrix entries */
179:   PetscFunctionGenerator<PetscScalar> *kernel;

181:   /* basis orthogonalized? */
182:   PetscBool orthogonal;

184:   /* customization */
185:   PetscInt  basisord;
186:   PetscInt  max_rank;
187:   PetscInt  bs;
188:   PetscReal rtol;
189:   PetscInt  norm_max_samples;
190:   PetscBool check_construction;
191:   PetscBool hara_verbose;
192:   PetscBool resize;

194:   /* keeps track of MatScale values */
195:   PetscScalar s;
196: } Mat_H2OPUS;

198: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
199: {
200:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

202:   PetscFunctionBegin;
203:   #if defined(H2OPUS_USE_MPI)
204:   h2opusDestroyDistributedHandle(a->handle);
205:   #else
206:   h2opusDestroyHandle(a->handle);
207:   #endif
208:   delete a->dist_hmatrix;
209:   delete a->hmatrix;
210:   PetscCall(PetscSFDestroy(&a->sf));
211:   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
212:   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
213:   PetscCall(ISDestroy(&a->h2opus_indexmap));
214:   delete a->xx;
215:   delete a->yy;
216:   delete a->hmatrix_gpu;
217:   delete a->dist_hmatrix_gpu;
218:   #if defined(PETSC_H2OPUS_USE_GPU)
219:   delete a->xx_gpu;
220:   delete a->yy_gpu;
221:   #endif
222:   delete a->sampler;
223:   delete a->ptcloud;
224:   delete a->kernel;
225:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
226:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
227:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
228:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
229:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
230:   PetscCall(PetscFree(A->data));
231:   PetscFunctionReturn(PETSC_SUCCESS);
232: }

234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237:   PetscBool   ish2opus;

239:   PetscFunctionBegin;
242:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
243:   if (ish2opus) {
244:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246:         PetscLayout t;
247:         t              = A->rmap;
248:         A->rmap        = a->h2opus_rmap;
249:         a->h2opus_rmap = t;
250:         t              = A->cmap;
251:         A->cmap        = a->h2opus_cmap;
252:         a->h2opus_cmap = t;
253:       }
254:     }
255:     a->nativemult = nm;
256:   }
257:   PetscFunctionReturn(PETSC_SUCCESS);
258: }

260: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261: {
262:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
263:   PetscBool   ish2opus;

265:   PetscFunctionBegin;
267:   PetscAssertPointer(nm, 2);
268:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
269:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
270:   *nm = a->nativemult;
271:   PetscFunctionReturn(PETSC_SUCCESS);
272: }

274: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
275: {
276:   PetscBool   ish2opus;
277:   PetscInt    nmax = PETSC_DECIDE;
278:   Mat_H2OPUS *a    = NULL;
279:   PetscBool   mult = PETSC_FALSE;

281:   PetscFunctionBegin;
282:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
 283:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
284:     a = (Mat_H2OPUS *)A->data;

286:     nmax = a->norm_max_samples;
287:     mult = a->nativemult;
288:     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
289:   } else {
290:     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
291:   }
292:   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
293:   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
294:   PetscFunctionReturn(PETSC_SUCCESS);
295: }

297: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
298: {
299:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
300:   PetscInt    n;
301:   PetscBool   boundtocpu = PETSC_TRUE;

303:   PetscFunctionBegin;
304:   #if defined(PETSC_H2OPUS_USE_GPU)
305:   boundtocpu = A->boundtocpu;
306:   #endif
307:   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
308:   if (boundtocpu) {
309:     if (h2opus->xxs < xN) {
310:       h2opus->xx->resize(n * xN);
311:       h2opus->xxs = xN;
312:     }
313:     if (h2opus->yys < yN) {
314:       h2opus->yy->resize(n * yN);
315:       h2opus->yys = yN;
316:     }
317:   }
318:   #if defined(PETSC_H2OPUS_USE_GPU)
319:   if (!boundtocpu) {
320:     if (h2opus->xxs_gpu < xN) {
321:       h2opus->xx_gpu->resize(n * xN);
322:       h2opus->xxs_gpu = xN;
323:     }
324:     if (h2opus->yys_gpu < yN) {
325:       h2opus->yy_gpu->resize(n * yN);
326:       h2opus->yys_gpu = yN;
327:     }
328:   }
329:   #endif
330:   PetscFunctionReturn(PETSC_SUCCESS);
331: }

333: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
334: {
335:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
336:   #if defined(H2OPUS_USE_MPI)
337:   h2opusHandle_t handle = h2opus->handle->handle;
338:   #else
339:   h2opusHandle_t handle = h2opus->handle;
340:   #endif
341:   PetscBool    boundtocpu = PETSC_TRUE;
342:   PetscScalar *xx, *yy, *uxx, *uyy;
343:   PetscInt     blda, clda;
344:   PetscMPIInt  size;
345:   PetscSF      bsf, csf;
346:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

348:   PetscFunctionBegin;
349:   HLibProfile::clear();
350:   #if defined(PETSC_H2OPUS_USE_GPU)
351:   boundtocpu = A->boundtocpu;
352:   #endif
353:   PetscCall(MatDenseGetLDA(B, &blda));
354:   PetscCall(MatDenseGetLDA(C, &clda));
355:   if (usesf) {
356:     PetscInt n;

358:     PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
359:     PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));

361:     PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
362:     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
363:     blda = n;
364:     clda = n;
365:   }
366:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
367:   if (boundtocpu) {
368:     PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
369:     PetscCall(MatDenseGetArrayWrite(C, &yy));
370:     if (usesf) {
371:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
372:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
373:       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
374:       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375:     } else {
376:       uxx = xx;
377:       uyy = yy;
378:     }
379:     if (size > 1) {
380:       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
381:       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
382:   #if defined(H2OPUS_USE_MPI)
383:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
384:   #endif
385:     } else {
386:       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
387:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
388:     }
389:     PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
390:     if (usesf) {
391:       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
392:       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393:     }
394:     PetscCall(MatDenseRestoreArrayWrite(C, &yy));
395:   #if defined(PETSC_H2OPUS_USE_GPU)
396:   } else {
397:     PetscBool ciscuda, biscuda;

399:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
400:     PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
401:     if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
402:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
403:     if (!ciscuda) {
404:       C->assembled = PETSC_TRUE;
405:       PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
406:     }
407:     PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
408:     PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
409:     if (usesf) {
410:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
411:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
412:       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
413:       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414:     } else {
415:       uxx = xx;
416:       uyy = yy;
417:     }
418:     PetscCall(PetscLogGpuTimeBegin());
419:     if (size > 1) {
420:       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
421:       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
422:     #if defined(H2OPUS_USE_MPI)
423:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
424:     #endif
425:     } else {
426:       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
427:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
428:     }
429:     PetscCall(PetscLogGpuTimeEnd());
430:     PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
431:     if (usesf) {
432:       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
433:       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434:     }
435:     PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
436:     if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
437:     if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
438:   #endif
439:   }
440:   { /* log flops */
441:     double gops, time, perf, dev;
442:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
443:   #if defined(PETSC_H2OPUS_USE_GPU)
444:     if (boundtocpu) {
445:       PetscCall(PetscLogFlops(1e9 * gops));
446:     } else {
447:       PetscCall(PetscLogGpuFlops(1e9 * gops));
448:     }
449:   #else
450:     PetscCall(PetscLogFlops(1e9 * gops));
451:   #endif
452:   }
453:   PetscFunctionReturn(PETSC_SUCCESS);
454: }

456: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
457: {
458:   Mat_Product *product = C->product;

460:   PetscFunctionBegin;
461:   MatCheckProduct(C, 1);
462:   switch (product->type) {
463:   case MATPRODUCT_AB:
464:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
465:     break;
466:   case MATPRODUCT_AtB:
467:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
468:     break;
469:   default:
470:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
471:   }
472:   PetscFunctionReturn(PETSC_SUCCESS);
473: }

475: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
476: {
477:   Mat_Product *product = C->product;
478:   PetscBool    cisdense;
479:   Mat          A, B;

481:   PetscFunctionBegin;
482:   MatCheckProduct(C, 1);
483:   A = product->A;
484:   B = product->B;
485:   switch (product->type) {
486:   case MATPRODUCT_AB:
487:     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
488:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
489:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
490:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
491:     PetscCall(MatSetUp(C));
492:     break;
493:   case MATPRODUCT_AtB:
494:     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
495:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
496:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
497:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
498:     PetscCall(MatSetUp(C));
499:     break;
500:   default:
501:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
502:   }
503:   C->ops->productsymbolic = NULL;
504:   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
505:   PetscFunctionReturn(PETSC_SUCCESS);
506: }

508: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
509: {
510:   PetscFunctionBegin;
511:   MatCheckProduct(C, 1);
512:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
513:   PetscFunctionReturn(PETSC_SUCCESS);
514: }

516: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517: {
518:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
519:   #if defined(H2OPUS_USE_MPI)
520:   h2opusHandle_t handle = h2opus->handle->handle;
521:   #else
522:   h2opusHandle_t handle = h2opus->handle;
523:   #endif
524:   PetscBool    boundtocpu = PETSC_TRUE;
525:   PetscInt     n;
526:   PetscScalar *xx, *yy, *uxx, *uyy;
527:   PetscMPIInt  size;
528:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

530:   PetscFunctionBegin;
531:   HLibProfile::clear();
532:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
533:   #if defined(PETSC_H2OPUS_USE_GPU)
534:   boundtocpu = A->boundtocpu;
535:   #endif
536:   if (usesf) {
537:     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
538:   } else n = A->rmap->n;
539:   if (boundtocpu) {
540:     PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
541:     if (sy == 0.0) {
542:       PetscCall(VecGetArrayWrite(y, &yy));
543:     } else {
544:       PetscCall(VecGetArray(y, &yy));
545:     }
546:     if (usesf) {
547:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

550:       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
551:       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552:       if (sy != 0.0) {
553:         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
554:         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555:       }
556:     } else {
557:       uxx = xx;
558:       uyy = yy;
559:     }
560:     if (size > 1) {
561:       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
562:       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
563:   #if defined(H2OPUS_USE_MPI)
564:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565:   #endif
566:     } else {
567:       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
568:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569:     }
570:     PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
571:     if (usesf) {
572:       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
573:       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574:     }
575:     if (sy == 0.0) {
576:       PetscCall(VecRestoreArrayWrite(y, &yy));
577:     } else {
578:       PetscCall(VecRestoreArray(y, &yy));
579:     }
580:   #if defined(PETSC_H2OPUS_USE_GPU)
581:   } else {
582:     PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
583:     if (sy == 0.0) {
584:       PetscCall(VecCUDAGetArrayWrite(y, &yy));
585:     } else {
586:       PetscCall(VecCUDAGetArray(y, &yy));
587:     }
588:     if (usesf) {
589:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

592:       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
593:       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594:       if (sy != 0.0) {
595:         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
596:         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597:       }
598:     } else {
599:       uxx = xx;
600:       uyy = yy;
601:     }
602:     PetscCall(PetscLogGpuTimeBegin());
603:     if (size > 1) {
604:       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
605:       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
606:     #if defined(H2OPUS_USE_MPI)
607:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608:     #endif
609:     } else {
610:       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
611:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612:     }
613:     PetscCall(PetscLogGpuTimeEnd());
614:     PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
615:     if (usesf) {
616:       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
617:       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618:     }
619:     if (sy == 0.0) {
620:       PetscCall(VecCUDARestoreArrayWrite(y, &yy));
621:     } else {
622:       PetscCall(VecCUDARestoreArray(y, &yy));
623:     }
624:   #endif
625:   }
626:   { /* log flops */
627:     double gops, time, perf, dev;
628:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
629:   #if defined(PETSC_H2OPUS_USE_GPU)
630:     if (boundtocpu) {
631:       PetscCall(PetscLogFlops(1e9 * gops));
632:     } else {
633:       PetscCall(PetscLogGpuFlops(1e9 * gops));
634:     }
635:   #else
636:     PetscCall(PetscLogFlops(1e9 * gops));
637:   #endif
638:   }
639:   PetscFunctionReturn(PETSC_SUCCESS);
640: }

642: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643: {
644:   PetscBool xiscuda, yiscuda;

646:   PetscFunctionBegin;
647:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
648:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
649:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
650:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
651:   PetscFunctionReturn(PETSC_SUCCESS);
652: }

654: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655: {
656:   PetscBool xiscuda, yiscuda;

658:   PetscFunctionBegin;
659:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
660:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
661:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
662:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
663:   PetscFunctionReturn(PETSC_SUCCESS);
664: }

666: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667: {
668:   PetscBool xiscuda, ziscuda;

670:   PetscFunctionBegin;
671:   PetscCall(VecCopy(y, z));
672:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
673:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
674:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
675:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
676:   PetscFunctionReturn(PETSC_SUCCESS);
677: }

679: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680: {
681:   PetscBool xiscuda, ziscuda;

683:   PetscFunctionBegin;
684:   PetscCall(VecCopy(y, z));
685:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
686:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
687:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
688:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
689:   PetscFunctionReturn(PETSC_SUCCESS);
690: }

692: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693: {
694:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

696:   PetscFunctionBegin;
697:   a->s *= s;
698:   PetscFunctionReturn(PETSC_SUCCESS);
699: }

701: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
702: {
703:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

705:   PetscFunctionBegin;
706:   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
707:   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
708:   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
709:   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
710:   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
711:   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
 712:   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be taken when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
713:   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
714:   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
715:   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
716:   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
717:   PetscOptionsHeadEnd();
718:   PetscFunctionReturn(PETSC_SUCCESS);
719: }
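
/*
  Example (hypothetical command line; the options are the ones registered above): build the
  hierarchical matrix with a larger leaf size, a tighter sampling tolerance, and a construction
  check at MatAssemblyEnd() time

    ./app -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 \
          -mat_h2opus_maxrank 128 -mat_h2opus_rtol 1e-6 -mat_h2opus_check
*/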

721: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);

723: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724: {
725:   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
726:   Vec                c;
727:   PetscInt           spacedim;
728:   const PetscScalar *coords;

730:   PetscFunctionBegin;
731:   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
732:   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
733:   if (!c && a->sampler) {
734:     Mat S = a->sampler->GetSamplingMat();

736:     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
737:   }
738:   if (!c) {
739:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
740:   } else {
741:     PetscCall(VecGetArrayRead(c, &coords));
742:     PetscCall(VecGetBlockSize(c, &spacedim));
743:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
744:     PetscCall(VecRestoreArrayRead(c, &coords));
745:   }
746:   PetscFunctionReturn(PETSC_SUCCESS);
747: }

749: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
750: {
751:   MPI_Comm      comm;
752:   PetscMPIInt   size;
753:   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
754:   PetscInt      n = 0, *idx = NULL;
755:   int          *iidx = NULL;
756:   PetscCopyMode own;
757:   PetscBool     rid;

759:   PetscFunctionBegin;
760:   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
 761:   if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
762:     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
763:   #if defined(PETSC_H2OPUS_USE_GPU)
764:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
765:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
766:     a->xxs_gpu = 1;
767:     a->yys_gpu = 1;
768:   #endif
769:     a->xx  = new thrust::host_vector<PetscScalar>(n);
770:     a->yy  = new thrust::host_vector<PetscScalar>(n);
771:     a->xxs = 1;
772:     a->yys = 1;
773:   } else {
774:     IS is;
775:     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
776:     PetscCallMPI(MPI_Comm_size(comm, &size));
777:     if (!a->h2opus_indexmap) {
778:       if (size > 1) {
779:         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
780:   #if defined(H2OPUS_USE_MPI)
781:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
782:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
783:   #endif
784:       } else {
785:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
786:         n    = a->hmatrix->u_basis_tree.index_map.size();
787:       }

789:       if (PetscDefined(USE_64BIT_INDICES)) {
790:         PetscInt i;

792:         own = PETSC_OWN_POINTER;
793:         PetscCall(PetscMalloc1(n, &idx));
794:         for (i = 0; i < n; i++) idx[i] = iidx[i];
795:       } else {
796:         own = PETSC_COPY_VALUES;
797:         idx = (PetscInt *)iidx;
798:       }
799:       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
800:       PetscCall(ISSetPermutation(is));
801:       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
802:       a->h2opus_indexmap = is;
803:     }
804:     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
805:     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
806:     rid = (PetscBool)(n == A->rmap->n);
807:     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
808:     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
809:     if (!rid) {
 810:       if (size > 1) { /* Parallel distribution may be different; save it here for the fast path in MatMult (see MatH2OpusSetNativeMult) */
811:         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
812:         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
813:         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
814:         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
815:       }
816:       PetscCall(PetscSFCreate(comm, &a->sf));
817:       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
818:       PetscCall(PetscSFSetUp(a->sf));
819:       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
820:   #if defined(PETSC_H2OPUS_USE_GPU)
821:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
822:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
823:       a->xxs_gpu = 1;
824:       a->yys_gpu = 1;
825:   #endif
826:       a->xx  = new thrust::host_vector<PetscScalar>(n);
827:       a->yy  = new thrust::host_vector<PetscScalar>(n);
828:       a->xxs = 1;
829:       a->yys = 1;
830:     }
831:     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
832:   }
833:   a->multsetup = PETSC_TRUE;
834:   PetscFunctionReturn(PETSC_SUCCESS);
835: }

837: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
838: {
839:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
840:   #if defined(H2OPUS_USE_MPI)
841:   h2opusHandle_t handle = a->handle->handle;
842:   #else
843:   h2opusHandle_t handle = a->handle;
844:   #endif
845:   PetscBool   kernel       = PETSC_FALSE;
846:   PetscBool   boundtocpu   = PETSC_TRUE;
847:   PetscBool   samplingdone = PETSC_FALSE;
848:   MPI_Comm    comm;
849:   PetscMPIInt size;

851:   PetscFunctionBegin;
852:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
853:   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
854:   PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

856:   /* XXX */
857:   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

859:   PetscCallMPI(MPI_Comm_size(comm, &size));
860:   /* TODO REUSABILITY of geometric construction */
861:   delete a->hmatrix;
862:   delete a->dist_hmatrix;
863:   #if defined(PETSC_H2OPUS_USE_GPU)
864:   delete a->hmatrix_gpu;
865:   delete a->dist_hmatrix_gpu;
866:   #endif
867:   a->orthogonal = PETSC_FALSE;

869:   /* TODO: other? */
870:   H2OpusBoxCenterAdmissibility adm(a->eta);

872:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
873:   if (size > 1) {
874:   #if defined(H2OPUS_USE_MPI)
875:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
876:   #else
877:     a->dist_hmatrix = NULL;
878:   #endif
879:   } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
880:   PetscCall(MatH2OpusInferCoordinates_Private(A));
881:   PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
882:   if (a->kernel) {
883:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
884:     if (size > 1) {
885:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
886:   #if defined(H2OPUS_USE_MPI)
887:       buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
888:   #endif
889:     } else {
890:       buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
891:     }
892:     kernel = PETSC_TRUE;
893:   } else {
894:     PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
895:     buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
896:   }
897:   PetscCall(MatSetUpMultiply_H2OPUS(A));

899:   #if defined(PETSC_H2OPUS_USE_GPU)
900:   boundtocpu = A->boundtocpu;
901:   if (!boundtocpu) {
902:     if (size > 1) {
903:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
904:     #if defined(H2OPUS_USE_MPI)
905:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
906:     #endif
907:     } else {
908:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
909:     }
910:   }
911:   #endif
912:   if (size == 1) {
913:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
914:       PetscReal Anorm;
915:       bool      verbose;

917:       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
918:       verbose = a->hara_verbose;
919:       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
920:       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
921:       if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
922:       a->sampler->SetStream(handle->getMainStream());
923:       if (boundtocpu) {
924:         a->sampler->SetGPUSampling(false);
925:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
926:   #if defined(PETSC_H2OPUS_USE_GPU)
927:       } else {
928:         a->sampler->SetGPUSampling(true);
929:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
930:   #endif
931:       }
932:       samplingdone = PETSC_TRUE;
933:     }
934:   }
935:   #if defined(PETSC_H2OPUS_USE_GPU)
936:   if (!boundtocpu) {
937:     delete a->hmatrix;
938:     delete a->dist_hmatrix;
939:     a->hmatrix      = NULL;
940:     a->dist_hmatrix = NULL;
941:   }
942:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
943:   #endif
944:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

946:   if (!a->s) a->s = 1.0;
947:   A->assembled = PETSC_TRUE;

949:   if (samplingdone) {
950:     PetscBool check  = a->check_construction;
951:     PetscBool checke = PETSC_FALSE;

953:     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
954:     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
955:     if (check) {
956:       Mat       E, Ae;
957:       PetscReal n1, ni, n2;
958:       PetscReal n1A, niA, n2A;
959:       void (*normfunc)(void);

961:       Ae = a->sampler->GetSamplingMat();
962:       PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
963:       PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
964:       PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
965:       PetscCall(MatNorm(E, NORM_1, &n1));
966:       PetscCall(MatNorm(E, NORM_INFINITY, &ni));
967:       PetscCall(MatNorm(E, NORM_2, &n2));
968:       if (checke) {
969:         Mat eA, eE, eAe;

971:         PetscCall(MatComputeOperator(A, MATAIJ, &eA));
972:         PetscCall(MatComputeOperator(E, MATAIJ, &eE));
973:         PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
974:         PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
975:         PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976:         PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
977:         PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
978:         PetscCall(MatView(eA, NULL));
979:         PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
980:         PetscCall(MatView(eAe, NULL));
981:         PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
982:         PetscCall(MatView(eE, NULL));
983:         PetscCall(MatDestroy(&eA));
984:         PetscCall(MatDestroy(&eE));
985:         PetscCall(MatDestroy(&eAe));
986:       }

988:       PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
989:       PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
990:       PetscCall(MatNorm(Ae, NORM_1, &n1A));
991:       PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
992:       PetscCall(MatNorm(Ae, NORM_2, &n2A));
993:       n1A = PetscMax(n1A, PETSC_SMALL);
994:       n2A = PetscMax(n2A, PETSC_SMALL);
995:       niA = PetscMax(niA, PETSC_SMALL);
996:       PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
997:       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
998:       PetscCall(MatDestroy(&E));
999:     }
1000:     a->sampler->SetSamplingMat(NULL);
1001:   }
1002:   PetscFunctionReturn(PETSC_SUCCESS);
1003: }

1005: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1006: {
1007:   PetscMPIInt size;
1008:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1010:   PetscFunctionBegin;
1011:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1012:   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1013:   a->hmatrix->clearData();
1014:   #if defined(PETSC_H2OPUS_USE_GPU)
1015:   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1016:   #endif
1017:   PetscFunctionReturn(PETSC_SUCCESS);
1018: }

1020: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1021: {
1022:   Mat         A;
1023:   Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1024:   #if defined(PETSC_H2OPUS_USE_GPU)
1025:   PetscBool iscpu = PETSC_FALSE;
1026:   #else
1027:   PetscBool iscpu = PETSC_TRUE;
1028:   #endif
1029:   MPI_Comm comm;

1031:   PetscFunctionBegin;
1032:   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1033:   PetscCall(MatCreate(comm, &A));
1034:   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1035:   PetscCall(MatSetType(A, MATH2OPUS));
1036:   PetscCall(MatPropagateSymmetryOptions(B, A));
1037:   a = (Mat_H2OPUS *)A->data;

1039:   a->eta              = b->eta;
1040:   a->leafsize         = b->leafsize;
1041:   a->basisord         = b->basisord;
1042:   a->max_rank         = b->max_rank;
1043:   a->bs               = b->bs;
1044:   a->rtol             = b->rtol;
1045:   a->norm_max_samples = b->norm_max_samples;
1046:   if (op == MAT_COPY_VALUES) a->s = b->s;

1048:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1049:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1051:   #if defined(H2OPUS_USE_MPI)
1052:   if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1053:     #if defined(PETSC_H2OPUS_USE_GPU)
1054:   if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1055:     #endif
1056:   #endif
1057:   if (b->hmatrix) {
1058:     a->hmatrix = new HMatrix(*b->hmatrix);
1059:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1060:   }
1061:   #if defined(PETSC_H2OPUS_USE_GPU)
1062:   if (b->hmatrix_gpu) {
1063:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1064:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1065:   }
1066:   #endif
1067:   if (b->sf) {
1068:     PetscCall(PetscObjectReference((PetscObject)b->sf));
1069:     a->sf = b->sf;
1070:   }
1071:   if (b->h2opus_indexmap) {
1072:     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1073:     a->h2opus_indexmap = b->h2opus_indexmap;
1074:   }

1076:   PetscCall(MatSetUp(A));
1077:   PetscCall(MatSetUpMultiply_H2OPUS(A));
1078:   if (op == MAT_COPY_VALUES) {
1079:     A->assembled  = PETSC_TRUE;
1080:     a->orthogonal = b->orthogonal;
1081:   #if defined(PETSC_H2OPUS_USE_GPU)
1082:     A->offloadmask = B->offloadmask;
1083:   #endif
1084:   }
1085:   #if defined(PETSC_H2OPUS_USE_GPU)
1086:   iscpu = B->boundtocpu;
1087:   #endif
1088:   PetscCall(MatBindToCPU(A, iscpu));

1090:   *nA = A;
1091:   PetscFunctionReturn(PETSC_SUCCESS);
1092: }

1094: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1095: {
1096:   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1097:   PetscBool         isascii, vieweps;
1098:   PetscMPIInt       size;
1099:   PetscViewerFormat format;

1101:   PetscFunctionBegin;
1102:   PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1103:   PetscCall(PetscViewerGetFormat(view, &format));
1104:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1105:   if (isascii) {
1106:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1107:       if (size == 1) {
1108:         FILE *fp;
1109:         PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1110:         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1111:       }
1112:     } else {
1113:       PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1114:       PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1115:       PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1116:       if (!h2opus->kernel) {
1117:         PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1118:       } else {
1119:         PetscCall(PetscViewerASCIIPrintf(view, "  Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1120:       }
1121:       PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1122:       if (size == 1) {
1123:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1124:         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1125:   #if defined(PETSC_HAVE_CUDA)
1126:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1127:         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1128:   #endif
1129:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1130:   #if defined(PETSC_HAVE_CUDA)
1131:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1132:   #endif
1133:       } else {
1134:   #if defined(PETSC_HAVE_CUDA)
1135:         double      matrix_mem[4] = {0., 0., 0., 0.};
1136:         PetscMPIInt rsize         = 4;
1137:   #else
1138:         double      matrix_mem[2] = {0., 0.};
1139:         PetscMPIInt rsize         = 2;
1140:   #endif
1141:   #if defined(H2OPUS_USE_MPI)
1142:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1143:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1144:     #if defined(PETSC_HAVE_CUDA)
1145:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1146:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1147:     #endif
1148:   #endif
1149:         PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1150:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1151:   #if defined(PETSC_HAVE_CUDA)
1152:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1153:   #endif
1154:       }
1155:     }
1156:   }
1157:   vieweps = PETSC_FALSE;
1158:   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1159:   if (vieweps) {
1160:     char        filename[256];
1161:     const char *name;

1163:     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1164:     PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1165:     PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1166:     outputEps(*h2opus->hmatrix, filename);
1167:   }
1168:   PetscFunctionReturn(PETSC_SUCCESS);
1169: }

1171: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1172: {
1173:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1174:   PetscReal  *gcoords;
1175:   PetscInt    N;
1176:   MPI_Comm    comm;
1177:   PetscMPIInt size;
1178:   PetscBool   cong;

1180:   PetscFunctionBegin;
1181:   PetscCall(PetscLayoutSetUp(A->rmap));
1182:   PetscCall(PetscLayoutSetUp(A->cmap));
1183:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1184:   PetscCall(MatHasCongruentLayouts(A, &cong));
1185:   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1186:   N = A->rmap->N;
1187:   PetscCallMPI(MPI_Comm_size(comm, &size));
1188:   if (spacedim > 0 && size > 1 && cdist) {
1189:     PetscSF      sf;
1190:     MPI_Datatype dtype;

1192:     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1193:     PetscCallMPI(MPI_Type_commit(&dtype));

1195:     PetscCall(PetscSFCreate(comm, &sf));
1196:     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1197:     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1198:     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1199:     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1200:     PetscCall(PetscSFDestroy(&sf));
1201:     PetscCallMPI(MPI_Type_free(&dtype));
1202:   } else gcoords = (PetscReal *)coords;

1204:   delete h2opus->ptcloud;
1205:   delete h2opus->kernel;
1206:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1207:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1208:   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1209:   A->preallocated = PETSC_TRUE;
1210:   PetscFunctionReturn(PETSC_SUCCESS);
1211: }

1213:   #if defined(PETSC_H2OPUS_USE_GPU)
1214: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1215: {
1216:   PetscMPIInt size;
1217:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1219:   PetscFunctionBegin;
1220:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1221:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1222:     if (size > 1) {
1223:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1224:     #if defined(H2OPUS_USE_MPI)
1225:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1226:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1227:     #endif
1228:     } else {
1229:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1230:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1231:       else *a->hmatrix = *a->hmatrix_gpu;
1232:     }
1233:     delete a->hmatrix_gpu;
1234:     delete a->dist_hmatrix_gpu;
1235:     a->hmatrix_gpu      = NULL;
1236:     a->dist_hmatrix_gpu = NULL;
1237:     A->offloadmask      = PETSC_OFFLOAD_CPU;
1238:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1239:     if (size > 1) {
1240:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1241:     #if defined(H2OPUS_USE_MPI)
1242:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1243:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1244:     #endif
1245:     } else {
1246:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1247:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1248:       else *a->hmatrix_gpu = *a->hmatrix;
1249:     }
1250:     delete a->hmatrix;
1251:     delete a->dist_hmatrix;
1252:     a->hmatrix      = NULL;
1253:     a->dist_hmatrix = NULL;
1254:     A->offloadmask  = PETSC_OFFLOAD_GPU;
1255:   }
1256:   PetscCall(PetscFree(A->defaultvectype));
1257:   if (!flg) {
1258:     PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1259:   } else {
1260:     PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1261:   }
1262:   A->boundtocpu = flg;
1263:   PetscFunctionReturn(PETSC_SUCCESS);
1264: }
1265:   #endif

1267: /*MC
1268:    MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.

1270:    Options Database Key:
1271: .  -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1273:    Level: beginner

1275:    Notes:
1276:    H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports execution on CPUs and on NVIDIA GPUs.

1278:    For CPU-only builds, configure PETSc with `./configure --download-h2opus --download-thrust` to use H2Opus.
1279:    To run on NVIDIA GPUs, configure with `./configure --download-h2opus --download-magma --download-kblas`.

1281: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1282: M*/
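
/*
  Minimal usage sketch (illustrative, not part of the original source): create a square H2
  matrix from a user kernel and 3D coordinates, then use it like any other Mat. The names
  n, N, coords, GaussianKernel and gctx are hypothetical, and the exact argument list of
  MatCreateH2OpusFromKernel() should be taken from its manual page; the call below only
  indicates the intent, with the admissibility, leaf-size and basis-order values matching
  the defaults set in MatCreate_H2OPUS() below.

    Mat A;
    PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, N, N, // local and global sizes
                                        3, coords, PETSC_TRUE,        // space dimension, coordinates, distributed coordinates
                                        GaussianKernel, &gctx,        // kernel and its context
                                        0.9, 32, 4,                   // eta, leaf size, basis order
                                        &A));
    PetscCall(MatSetFromOptions(A));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); // geometric construction happens here
    PetscCall(MatMult(A, x, y));
*/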
1283: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1284: {
1285:   Mat_H2OPUS *a;
1286:   PetscMPIInt size;

1288:   PetscFunctionBegin;
1289:   #if defined(PETSC_H2OPUS_USE_GPU)
1290:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1291:   #endif
1292:   PetscCall(PetscNew(&a));
1293:   A->data = (void *)a;

1295:   a->eta              = 0.9;
1296:   a->leafsize         = 32;
1297:   a->basisord         = 4;
1298:   a->max_rank         = 64;
1299:   a->bs               = 32;
1300:   a->rtol             = 1.e-4;
1301:   a->s                = 1.0;
1302:   a->norm_max_samples = 10;
1303:   a->resize           = PETSC_TRUE; /* reallocate after compression */
1304:   #if defined(H2OPUS_USE_MPI)
1305:   h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1306:   #else
1307:   h2opusCreateHandle(&a->handle);
1308:   #endif
1309:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1310:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1311:   PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

1313:   A->ops->destroy          = MatDestroy_H2OPUS;
1314:   A->ops->view             = MatView_H2OPUS;
1315:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1316:   A->ops->mult             = MatMult_H2OPUS;
1317:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1318:   A->ops->multadd          = MatMultAdd_H2OPUS;
1319:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1320:   A->ops->scale            = MatScale_H2OPUS;
1321:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1322:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1323:   A->ops->norm             = MatNorm_H2OPUS;
1324:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1325:   #if defined(PETSC_H2OPUS_USE_GPU)
1326:   A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1327:   #endif

1329:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1330:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1331:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1332:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1333:   #if defined(PETSC_H2OPUS_USE_GPU)
1334:   PetscCall(PetscFree(A->defaultvectype));
1335:   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1336:   #endif
1337:   PetscFunctionReturn(PETSC_SUCCESS);
1338: }

1340: /*@C
1341:   MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1343:   Input Parameter:
1344: . A - the matrix

1346:   Level: intermediate
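
   Example Usage:
   A minimal sketch; `A` is assumed to be an already assembled `MATH2OPUS` matrix.
.vb
   /* explicitly orthogonalize the basis tree; MatH2OpusCompress() also triggers this when needed */
   PetscCall(MatH2OpusOrthogonalize(A));
.ve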

1348: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1349: @*/
1350: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1351: {
1352:   PetscBool   ish2opus;
1353:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1354:   PetscMPIInt size;
1355:   PetscBool   boundtocpu = PETSC_TRUE;

1357:   PetscFunctionBegin;
1360:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1361:   if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1362:   if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1363:   HLibProfile::clear();
1364:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1365:   #if defined(PETSC_H2OPUS_USE_GPU)
1366:   boundtocpu = A->boundtocpu;
1367:   #endif
1368:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1369:   if (size > 1) {
1370:     if (boundtocpu) {
1371:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1372:   #if defined(H2OPUS_USE_MPI)
1373:       distributed_horthog(*a->dist_hmatrix, a->handle);
1374:   #endif
1375:   #if defined(PETSC_H2OPUS_USE_GPU)
1376:       A->offloadmask = PETSC_OFFLOAD_CPU;
1377:     } else {
1378:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1379:       PetscCall(PetscLogGpuTimeBegin());
1380:     #if defined(H2OPUS_USE_MPI)
1381:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1382:     #endif
1383:       PetscCall(PetscLogGpuTimeEnd());
1384:   #endif
1385:     }
1386:   } else {
1387:   #if defined(H2OPUS_USE_MPI)
1388:     h2opusHandle_t handle = a->handle->handle;
1389:   #else
1390:     h2opusHandle_t handle = a->handle;
1391:   #endif
1392:     if (boundtocpu) {
1393:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1394:       horthog(*a->hmatrix, handle);
1395:   #if defined(PETSC_H2OPUS_USE_GPU)
1396:       A->offloadmask = PETSC_OFFLOAD_CPU;
1397:     } else {
1398:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1399:       PetscCall(PetscLogGpuTimeBegin());
1400:       horthog(*a->hmatrix_gpu, handle);
1401:       PetscCall(PetscLogGpuTimeEnd());
1402:   #endif
1403:     }
1404:   }
1405:   a->orthogonal = PETSC_TRUE;
1406:   { /* log flops */
1407:     double gops, time, perf, dev;
1408:     HLibProfile::getHorthogPerf(gops, time, perf, dev);
1409:   #if defined(PETSC_H2OPUS_USE_GPU)
1410:     if (boundtocpu) {
1411:       PetscCall(PetscLogFlops(1e9 * gops));
1412:     } else {
1413:       PetscCall(PetscLogGpuFlops(1e9 * gops));
1414:     }
1415:   #else
1416:     PetscCall(PetscLogFlops(1e9 * gops));
1417:   #endif
1418:   }
1419:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1420:   PetscFunctionReturn(PETSC_SUCCESS);
1421: }

1423: /*@C
1424:   MatH2OpusCompress - Compress a hierarchical matrix.

1426:   Input Parameters:
1427: + A   - the matrix
1428: - tol - the absolute truncation threshold

1430:   Level: intermediate
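
   Example Usage:
   A minimal sketch; `A` is assumed to be an already assembled `MATH2OPUS` matrix and the threshold value is only illustrative.
.vb
   /* truncate the hierarchical representation to an absolute threshold;
      the basis tree is orthogonalized first if necessary */
   PetscCall(MatH2OpusCompress(A, 1.e-6));
.ve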

1432: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1433: @*/
1434: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1435: {
1436:   PetscBool   ish2opus;
1437:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1438:   PetscMPIInt size;
1439:   PetscBool   boundtocpu = PETSC_TRUE;

1441:   PetscFunctionBegin;
1445:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1446:   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1447:   PetscCall(MatH2OpusOrthogonalize(A));
1448:   HLibProfile::clear();
1449:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1450:   #if defined(PETSC_H2OPUS_USE_GPU)
1451:   boundtocpu = A->boundtocpu;
1452:   #endif
1453:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1454:   if (size > 1) {
1455:     if (boundtocpu) {
1456:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1457:   #if defined(H2OPUS_USE_MPI)
1458:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1459:       if (a->resize) {
1460:         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1461:         delete a->dist_hmatrix;
1462:         a->dist_hmatrix = dist_hmatrix;
1463:       }
1464:   #endif
1465:   #if defined(PETSC_H2OPUS_USE_GPU)
1466:       A->offloadmask = PETSC_OFFLOAD_CPU;
1467:     } else {
1468:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1469:       PetscCall(PetscLogGpuTimeBegin());
1470:     #if defined(H2OPUS_USE_MPI)
1471:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

1473:       if (a->resize) {
1474:         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1475:         delete a->dist_hmatrix_gpu;
1476:         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1477:       }
1478:     #endif
1479:       PetscCall(PetscLogGpuTimeEnd());
1480:   #endif
1481:     }
1482:   } else {
1483:   #if defined(H2OPUS_USE_MPI)
1484:     h2opusHandle_t handle = a->handle->handle;
1485:   #else
1486:     h2opusHandle_t handle = a->handle;
1487:   #endif
1488:     if (boundtocpu) {
1489:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1490:       hcompress(*a->hmatrix, tol, handle);

1492:       if (a->resize) {
1493:         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1494:         delete a->hmatrix;
1495:         a->hmatrix = hmatrix;
1496:       }
1497:   #if defined(PETSC_H2OPUS_USE_GPU)
1498:       A->offloadmask = PETSC_OFFLOAD_CPU;
1499:     } else {
1500:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1501:       PetscCall(PetscLogGpuTimeBegin());
1502:       hcompress(*a->hmatrix_gpu, tol, handle);
1503:       PetscCall(PetscLogGpuTimeEnd());

1505:       if (a->resize) {
1506:         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1507:         delete a->hmatrix_gpu;
1508:         a->hmatrix_gpu = hmatrix_gpu;
1509:       }
1510:   #endif
1511:     }
1512:   }
1513:   { /* log flops */
1514:     double gops, time, perf, dev;
1515:     HLibProfile::getHcompressPerf(gops, time, perf, dev);
1516:   #if defined(PETSC_H2OPUS_USE_GPU)
1517:     if (boundtocpu) {
1518:       PetscCall(PetscLogFlops(1e9 * gops));
1519:     } else {
1520:       PetscCall(PetscLogGpuFlops(1e9 * gops));
1521:     }
1522:   #else
1523:     PetscCall(PetscLogFlops(1e9 * gops));
1524:   #endif
1525:   }
1526:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1527:   PetscFunctionReturn(PETSC_SUCCESS);
1528: }

1530: /*@C
1531:   MatH2OpusSetSamplingMat - Set the matrix to be sampled through matrix-vector products when constructing the hierarchical matrix.

1533:   Input Parameters:
1534: + A   - the hierarchical matrix
1535: . B   - the matrix to be sampled
1536: . bs  - maximum number of samples to be taken concurrently
1537: - tol - relative tolerance for construction

1539:   Level: intermediate

1541:   Notes:
1542:   You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
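
   Example Usage:
   A minimal sketch; `Bnew` stands for a user-provided operator with the same layout as `A`.
.vb
   PetscCall(MatH2OpusSetSamplingMat(A, Bnew, 32, 1.e-4));
   /* rebuild the hierarchical matrix from the new sampling matrix */
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve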

1544: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1545: @*/
1546: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1547: {
1548:   PetscBool ish2opus;

1550:   PetscFunctionBegin;
1556:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1557:   if (ish2opus) {
1558:     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1560:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1561:     a->sampler->SetSamplingMat(B);
1562:     if (bs > 0) a->bs = bs;
1563:     if (tol > 0.) a->rtol = tol;
1564:     delete a->kernel;
1565:   }
1566:   PetscFunctionReturn(PETSC_SUCCESS);
1567: }

1569: /*@C
1570:   MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.

1572:   Input Parameters:
1573: + comm      - MPI communicator
1574: . m         - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
1575: . n         - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
1576: . M         - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
1577: . N         - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
1578: . spacedim  - dimension of the space coordinates
1579: . coords    - coordinates of the points
1580: . cdist     - whether or not coordinates are distributed
1581: . kernel    - computational kernel (or `NULL`)
1582: . kernelctx - kernel context
1583: . eta       - admissibility condition tolerance
1584: . leafsize  - leaf size in cluster tree
1585: - basisord  - approximation order for Chebyshev interpolation of low-rank blocks

1587:   Output Parameter:
1588: . nA - matrix

1590:   Options Database Keys:
1591: + -mat_h2opus_leafsize <`PetscInt`>    - Leaf size of cluster tree
1592: . -mat_h2opus_eta <`PetscReal`>        - Admissibility condition tolerance
1593: . -mat_h2opus_order <`PetscInt`>       - Chebyshev approximation order
1594: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

1596:   Level: intermediate
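
   Example Usage:
   A minimal sketch; `mykernel` is assumed to be a user function matching the `MatH2OpusKernelFn` signature, and `coords` holds the `spacedim * m` local point coordinates.
.vb
   Mat A;

   /* the values 0.9, 32 and 4 match the default eta, leaf size and basis order */
   PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, m, m, PETSC_DETERMINE, PETSC_DETERMINE,
                                       spacedim, coords, PETSC_FALSE, mykernel, NULL,
                                       0.9, 32, 4, &A));
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve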

1598: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1599: @*/
1600: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1601: {
1602:   Mat         A;
1603:   Mat_H2OPUS *h2opus;
1604:   #if defined(PETSC_H2OPUS_USE_GPU)
1605:   PetscBool iscpu = PETSC_FALSE;
1606:   #else
1607:   PetscBool iscpu = PETSC_TRUE;
1608:   #endif

1610:   PetscFunctionBegin;
1611:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1612:   PetscCall(MatCreate(comm, &A));
1613:   PetscCall(MatSetSizes(A, m, n, M, N));
1614:   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1615:   PetscCall(MatSetType(A, MATH2OPUS));
1616:   PetscCall(MatBindToCPU(A, iscpu));
1617:   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));

1619:   h2opus = (Mat_H2OPUS *)A->data;
1620:   if (eta > 0.) h2opus->eta = eta;
1621:   if (leafsize > 0) h2opus->leafsize = leafsize;
1622:   if (basisord > 0) h2opus->basisord = basisord;

1624:   *nA = A;
1625:   PetscFunctionReturn(PETSC_SUCCESS);
1626: }

1628: /*@C
1629:   MatCreateH2OpusFromMat - Creates a `MATH2OPUS` by sampling a user-supplied operator.

1631:   Input Parameters:
1632: + B        - the matrix to be sampled
1633: . spacedim - dimension of the space coordinates
1634: . coords   - coordinates of the points
1635: . cdist    - whether or not coordinates are distributed
1636: . eta      - admissibility condition tolerance
1637: . leafsize - leaf size in cluster tree
1638: . maxrank  - maximum rank allowed
1639: . bs       - maximum number of samples to be taken concurrently
1640: - rtol     - relative tolerance for construction

1642:   Output Parameter:
1643: . nA - matrix

1645:   Options Database Keys:
1646: + -mat_h2opus_leafsize <`PetscInt`>      - Leaf size of cluster tree
1647: . -mat_h2opus_eta <`PetscReal`>          - Admissibility condition tolerance
1648: . -mat_h2opus_maxrank <`PetscInt`>       - Maximum rank when constructed from matvecs
1649: . -mat_h2opus_samples <`PetscInt`>       - Maximum number of samples to be taken concurrently when constructing from matvecs
1650: . -mat_h2opus_rtol <`PetscReal`>         - Relative tolerance for construction from sampling
1651: . -mat_h2opus_check <`PetscBool`>        - Check the approximation error during `MatAssemblyEnd()` when constructing from sampling
1652: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1653: - -mat_h2opus_normsamples <`PetscInt`>   - Maximum number of samples to be used when estimating norms

1655:   Level: intermediate

1657:   Note:
1658:   Not available in parallel
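
   Example Usage:
   A minimal sketch; `B` stands for a user-provided square operator and `coords` for its point coordinates. Non-positive values fall back to the defaults; the values shown here match them.
.vb
   Mat A;

   PetscCall(MatCreateH2OpusFromMat(B, spacedim, coords, PETSC_FALSE, 0.9, 32, 64, 32, 1.e-4, &A));
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve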

1660: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1661: @*/
1662: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1663: {
1664:   Mat         A;
1665:   Mat_H2OPUS *h2opus;
1666:   MPI_Comm    comm;
1667:   PetscBool   boundtocpu = PETSC_TRUE;

1669:   PetscFunctionBegin;
1678:   PetscAssertPointer(nA, 10);
1679:   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1680:   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1681:   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1682:   PetscCall(MatCreate(comm, &A));
1683:   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1684:   #if defined(PETSC_H2OPUS_USE_GPU)
1685:   {
1686:     VecType   vtype;
1687:     PetscBool isstd, iscuda, iskok;

1689:     PetscCall(MatGetVecType(B, &vtype));
1690:     PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1691:     PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1692:     PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1693:     PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1694:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1695:     if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1696:   }
1697:   #endif
1698:   PetscCall(MatSetType(A, MATH2OPUS));
1699:   PetscCall(MatBindToCPU(A, boundtocpu));
1700:   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1701:   PetscCall(MatPropagateSymmetryOptions(B, A));
1702:   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1704:   h2opus          = (Mat_H2OPUS *)A->data;
1705:   h2opus->sampler = new PetscMatrixSampler(B);
1706:   if (eta > 0.) h2opus->eta = eta;
1707:   if (leafsize > 0) h2opus->leafsize = leafsize;
1708:   if (maxrank > 0) h2opus->max_rank = maxrank;
1709:   if (bs > 0) h2opus->bs = bs;
1710:   if (rtol > 0.) h2opus->rtol = rtol;
1711:   *nA             = A;
1712:   A->preallocated = PETSC_TRUE;
1713:   PetscFunctionReturn(PETSC_SUCCESS);
1714: }

1716: /*@C
1717:   MatH2OpusGetIndexMap - Access the reordering index set.

1719:   Input Parameter:
1720: . A - the matrix

1722:   Output Parameter:
1723: . indexmap - the index set for the reordering

1725:   Level: intermediate
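
   Example Usage:
   A minimal sketch; the index set describes the reordering induced by the hierarchical clustering.
.vb
   IS map;

   PetscCall(MatH2OpusGetIndexMap(A, &map));
   PetscCall(ISView(map, PETSC_VIEWER_STDOUT_WORLD));
.ve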

1727: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1728: @*/
1729: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1730: {
1731:   PetscBool   ish2opus;
1732:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1734:   PetscFunctionBegin;
1737:   PetscAssertPointer(indexmap, 2);
1738:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1739:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1740:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1741:   *indexmap = a->h2opus_indexmap;
1742:   PetscFunctionReturn(PETSC_SUCCESS);
1743: }

1745: /*@C
1746:   MatH2OpusMapVec - Maps a vector between the PETSc and H2Opus orderings.

1748:   Input Parameters:
1749: + A             - the matrix
1750: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1751: - in            - the vector to be mapped

1753:   Output Parameter:
1754: . out - the newly created mapped vector

1756:   Level: intermediate
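
   Example Usage:
   A minimal sketch; `xnative` stands for a user vector in H2Opus ordering.
.vb
   Vec xpetsc;

   PetscCall(MatH2OpusMapVec(A, PETSC_TRUE, xnative, &xpetsc)); /* creates xpetsc in PETSc ordering */
   ...
   PetscCall(VecDestroy(&xpetsc));
.ve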

1758: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1759: @*/
1760: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1761: {
1762:   PetscBool    ish2opus;
1763:   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1764:   PetscScalar *xin, *xout;
1765:   PetscBool    nm;

1767:   PetscFunctionBegin;
1772:   PetscAssertPointer(out, 4);
1773:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1774:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1775:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1776:   nm = a->nativemult;
1777:   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1778:   PetscCall(MatCreateVecs(A, out, NULL));
1779:   PetscCall(MatH2OpusSetNativeMult(A, nm));
1780:   if (!a->sf) { /* same ordering */
1781:     PetscCall(VecCopy(in, *out));
1782:     PetscFunctionReturn(PETSC_SUCCESS);
1783:   }
1784:   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1785:   PetscCall(VecGetArrayWrite(*out, &xout));
1786:   if (nativetopetsc) {
1787:     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1788:     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1789:   } else {
1790:     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1791:     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792:   }
1793:   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1794:   PetscCall(VecRestoreArrayWrite(*out, &xout));
1795:   PetscFunctionReturn(PETSC_SUCCESS);
1796: }

1798: /*@C
1799:   MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $

1801:   Input Parameters:
1802: + A - the hierarchical `MATH2OPUS` matrix
1803: . s - the scaling factor
1804: . U - the dense low-rank update matrix
1805: - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)

1807:   Note:
1808:   The `U` and `V` matrices must be in `MATDENSE` format

1810:   Level: intermediate
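
   Example Usage:
   A minimal sketch; `U` and `V` stand for dense matrices with the same number of columns and row layouts compatible with `A`.
.vb
   PetscCall(MatH2OpusLowRankUpdate(A, U, V, 1.0));     /* A = A + U * V^T */
   PetscCall(MatH2OpusLowRankUpdate(A, U, NULL, -0.5)); /* A = A - 0.5 * U * U^T */
.ve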

1812: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1813: @*/
1814: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1815: {
1816:   PetscBool flg;

1818:   PetscFunctionBegin;
1821:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1823:   PetscCheckSameComm(A, 1, U, 2);
1824:   if (V) {
1826:     PetscCheckSameComm(A, 1, V, 3);
1827:   }

1830:   if (!V) V = U;
1831:   PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1832:   if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1833:   PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1834:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1835:   PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1836:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row layout");
1837:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1838:   if (flg) {
1839:     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1840:     const PetscScalar *u, *v, *uu, *vv;
1841:     PetscInt           ldu, ldv;
1842:     PetscMPIInt        size;
1843:   #if defined(H2OPUS_USE_MPI)
1844:     h2opusHandle_t handle = a->handle->handle;
1845:   #else
1846:     h2opusHandle_t handle = a->handle;
1847:   #endif
1848:     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1849:     PetscSF   usf, vsf;

1851:     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1852:     PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1853:     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1854:     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1855:     PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1856:     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1857:     PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1858:     PetscCall(MatDenseGetLDA(U, &ldu));
1859:     PetscCall(MatDenseGetLDA(V, &ldv));
1860:     PetscCall(MatBoundToCPU(A, &flg));
1861:     if (usesf) {
1862:       PetscInt n;

1864:       PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
1865:       PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
1866:       PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1867:       PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1868:       ldu = n;
1869:       ldv = n;
1870:     }
1871:     if (flg) {
1872:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1873:       PetscCall(MatDenseGetArrayRead(U, &u));
1874:       PetscCall(MatDenseGetArrayRead(V, &v));
1875:       if (usesf) {
1876:         vv = MatH2OpusGetThrustPointer(*a->yy);
1877:         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1878:         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1879:         if (U != V) {
1880:           uu = MatH2OpusGetThrustPointer(*a->xx);
1881:           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1882:           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1883:         } else uu = vv;
1884:       } else {
1885:         uu = u;
1886:         vv = v;
1887:       }
1888:       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1889:       PetscCall(MatDenseRestoreArrayRead(U, &u));
1890:       PetscCall(MatDenseRestoreArrayRead(V, &v));
1891:     } else {
1892:   #if defined(PETSC_H2OPUS_USE_GPU)
1893:       PetscBool flgU, flgV;

1895:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1896:       PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1897:       if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1898:       PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1899:       if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1900:       PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1901:       PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1902:       if (usesf) {
1903:         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1904:         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1905:         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1906:         if (U != V) {
1907:           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1908:           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1909:           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1910:         } else uu = vv;
1911:       } else {
1912:         uu = u;
1913:         vv = v;
1914:       }
1915:   #else
1916:       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1917:   #endif
1918:       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1919:   #if defined(PETSC_H2OPUS_USE_GPU)
1920:       PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1921:       PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1922:       if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1923:       if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1924:   #endif
1925:     }
1926:     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1927:     a->orthogonal = PETSC_FALSE;
1928:   }
1929:   PetscFunctionReturn(PETSC_SUCCESS);
1930: }
1931: #endif