Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  3: #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
  4:   #include <h2opus.h>
  5:   #if defined(H2OPUS_USE_MPI)
  6:     #include <h2opus/distributed/distributed_h2opus_handle.h>
  7:     #include <h2opus/distributed/distributed_geometric_construction.h>
  8:     #include <h2opus/distributed/distributed_hgemv.h>
  9:     #include <h2opus/distributed/distributed_horthog.h>
 10:     #include <h2opus/distributed/distributed_hcompress.h>
 11:   #endif
 12:   #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
 20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
 21: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
 22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
 23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);

 25:   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 27:   /* Use GPU only if H2OPUS is configured for GPU */
 28:   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 29:     #define PETSC_H2OPUS_USE_GPU
 30:   #endif
 31:   #if defined(PETSC_H2OPUS_USE_GPU)
 32:     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
 33:   #else
 34:     #define MatH2OpusUpdateIfNeeded(A, B) 0
 35:   #endif

 37: // TODO H2OPUS:
 38: // DistributedHMatrix
 39: //   unsymmetric ?
 40: //   transpose for distributed_hgemv?
 41: //   clearData()
 42: // Unify interface for sequential and parallel?
 43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 44: //
 45: template <class T>
 46: class PetscPointCloud : public H2OpusDataSet<T> {
 47: private:
 48:   int            dimension;
 49:   size_t         num_points;
 50:   std::vector<T> pts;

 52: public:
 53:   PetscPointCloud(int dim, size_t num_pts, const T coords[])
 54:   {
 55:     dim              = dim > 0 ? dim : 1;
 56:     this->dimension  = dim;
 57:     this->num_points = num_pts;

 59:     pts.resize(num_pts * dim);
 60:     if (coords) {
 61:       for (size_t n = 0; n < num_pts; n++)
 62:         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
 63:     } else {
 64:       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
 65:       for (size_t n = 0; n < num_pts; n++) {
 66:         pts[n * dim] = n * h;
 67:         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
 68:       }
 69:     }
 70:   }

 72:   PetscPointCloud(const PetscPointCloud<T> &other)
 73:   {
 74:     size_t N         = other.dimension * other.num_points;
 75:     this->dimension  = other.dimension;
 76:     this->num_points = other.num_points;
 77:     this->pts.resize(N);
 78:     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
 79:   }

 81:   int getDimension() const { return dimension; }

 83:   size_t getDataSetSize() const { return num_points; }

 85:   T getDataPoint(size_t idx, int dim) const
 86:   {
 87:     assert(dim < dimension && idx < num_points);
 88:     return pts[idx * dimension + dim];
 89:   }

 91:   void Print(std::ostream &out = std::cout)
 92:   {
 93:     out << "Dimension: " << dimension << std::endl;
 94:     out << "NumPoints: " << num_points << std::endl;
 95:     for (size_t n = 0; n < num_points; n++) {
 96:       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
 97:       out << std::endl;
 98:     }
 99:   }
100: };

102: template <class T>
103: class PetscFunctionGenerator {
104: private:
105:   MatH2OpusKernel k;
106:   int             dim;
107:   void           *ctx;

109: public:
110:   PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111:   {
112:     this->k   = k;
113:     this->dim = dim;
114:     this->ctx = ctx;
115:   }
116:   PetscFunctionGenerator(PetscFunctionGenerator &other)
117:   {
118:     this->k   = other.k;
119:     this->dim = other.dim;
120:     this->ctx = other.ctx;
121:   }
122:   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123: };

125: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

127:   /* just to not clutter the code */
128:   #if !defined(H2OPUS_USE_GPU)
129: typedef HMatrix HMatrix_GPU;
130:     #if defined(H2OPUS_USE_MPI)
131: typedef DistributedHMatrix DistributedHMatrix_GPU;
132:     #endif
133:   #endif

135: typedef struct {
136:   #if defined(H2OPUS_USE_MPI)
137:   distributedH2OpusHandle_t handle;
138:   #else
139:   h2opusHandle_t handle;
140:   #endif
141:   /* Sequential and parallel matrices are two different classes at the moment */
142:   HMatrix *hmatrix;
143:   #if defined(H2OPUS_USE_MPI)
144:   DistributedHMatrix *dist_hmatrix;
145:   #else
146:   HMatrix       *dist_hmatrix;     /* just to not clutter the code */
147:   #endif
148:   /* May use permutations */
149:   PetscSF                           sf;
150:   PetscLayout                       h2opus_rmap, h2opus_cmap;
151:   IS                                h2opus_indexmap;
152:   thrust::host_vector<PetscScalar> *xx, *yy;
153:   PetscInt                          xxs, yys;
154:   PetscBool                         multsetup;

156:   /* GPU */
157:   HMatrix_GPU *hmatrix_gpu;
158:   #if defined(H2OPUS_USE_MPI)
159:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
160:   #else
161:   HMatrix_GPU   *dist_hmatrix_gpu; /* just to not clutter the code */
162:   #endif
163:   #if defined(PETSC_H2OPUS_USE_GPU)
164:   thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165:   PetscInt                            xxs_gpu, yys_gpu;
166:   #endif

168:   /* construction from matvecs */
169:   PetscMatrixSampler *sampler;
170:   PetscBool           nativemult;

172:   /* Admissibility */
173:   PetscReal eta;
174:   PetscInt  leafsize;

176:   /* for dof reordering */
177:   PetscPointCloud<PetscReal> *ptcloud;

179:   /* kernel for generating matrix entries */
180:   PetscFunctionGenerator<PetscScalar> *kernel;

182:   /* basis orthogonalized? */
183:   PetscBool orthogonal;

185:   /* customization */
186:   PetscInt  basisord;
187:   PetscInt  max_rank;
188:   PetscInt  bs;
189:   PetscReal rtol;
190:   PetscInt  norm_max_samples;
191:   PetscBool check_construction;
192:   PetscBool hara_verbose;
193:   PetscBool resize;

195:   /* keeps track of MatScale values */
196:   PetscScalar s;
197: } Mat_H2OPUS;

199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

203:   #if defined(H2OPUS_USE_MPI)
204:   h2opusDestroyDistributedHandle(a->handle);
205:   #else
206:   h2opusDestroyHandle(a->handle);
207:   #endif
208:   delete a->dist_hmatrix;
209:   delete a->hmatrix;
210:   PetscSFDestroy(&a->sf);
211:   PetscLayoutDestroy(&a->h2opus_rmap);
212:   PetscLayoutDestroy(&a->h2opus_cmap);
213:   ISDestroy(&a->h2opus_indexmap);
214:   delete a->xx;
215:   delete a->yy;
216:   delete a->hmatrix_gpu;
217:   delete a->dist_hmatrix_gpu;
218:   #if defined(PETSC_H2OPUS_USE_GPU)
219:   delete a->xx_gpu;
220:   delete a->yy_gpu;
221:   #endif
222:   delete a->sampler;
223:   delete a->ptcloud;
224:   delete a->kernel;
225:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL);
226:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL);
227:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL);
228:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL);
229:   PetscObjectChangeTypeName((PetscObject)A, NULL);
230:   PetscFree(A->data);
231:   return 0;
232: }

234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237:   PetscBool   ish2opus;

241:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
242:   if (ish2opus) {
243:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245:         PetscLayout t;
246:         t              = A->rmap;
247:         A->rmap        = a->h2opus_rmap;
248:         a->h2opus_rmap = t;
249:         t              = A->cmap;
250:         A->cmap        = a->h2opus_cmap;
251:         a->h2opus_cmap = t;
252:       }
253:     }
254:     a->nativemult = nm;
255:   }
256:   return 0;
257: }

259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
262:   PetscBool   ish2opus;

266:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
267:   if (!ish2opus) SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for matrix type %s", ((PetscObject)A)->type_name);
268:   *nm = a->nativemult;
269:   return 0;
270: }

272: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
273: {
274:   PetscBool   ish2opus;
275:   PetscInt    nmax = PETSC_DECIDE;
276:   Mat_H2OPUS *a    = NULL;
277:   PetscBool   mult = PETSC_FALSE;

279:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
280:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
281:     a = (Mat_H2OPUS *)A->data;

283:     nmax = a->norm_max_samples;
284:     mult = a->nativemult;
285:     MatH2OpusSetNativeMult(A, PETSC_TRUE);
286:   } else {
287:     PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL);
288:   }
289:   MatApproximateNorm_Private(A, normtype, nmax, n);
290:   if (a) MatH2OpusSetNativeMult(A, mult);
291:   return 0;
292: }

294: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
295: {
296:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
297:   PetscInt    n;
298:   PetscBool   boundtocpu = PETSC_TRUE;

300:   #if defined(PETSC_H2OPUS_USE_GPU)
301:   boundtocpu = A->boundtocpu;
302:   #endif
303:   PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
304:   if (boundtocpu) {
305:     if (h2opus->xxs < xN) {
306:       h2opus->xx->resize(n * xN);
307:       h2opus->xxs = xN;
308:     }
309:     if (h2opus->yys < yN) {
310:       h2opus->yy->resize(n * yN);
311:       h2opus->yys = yN;
312:     }
313:   }
314:   #if defined(PETSC_H2OPUS_USE_GPU)
315:   if (!boundtocpu) {
316:     if (h2opus->xxs_gpu < xN) {
317:       h2opus->xx_gpu->resize(n * xN);
318:       h2opus->xxs_gpu = xN;
319:     }
320:     if (h2opus->yys_gpu < yN) {
321:       h2opus->yy_gpu->resize(n * yN);
322:       h2opus->yys_gpu = yN;
323:     }
324:   }
325:   #endif
326:   return 0;
327: }

329: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
330: {
331:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
332:   #if defined(H2OPUS_USE_MPI)
333:   h2opusHandle_t handle = h2opus->handle->handle;
334:   #else
335:   h2opusHandle_t handle = h2opus->handle;
336:   #endif
337:   PetscBool    boundtocpu = PETSC_TRUE;
338:   PetscScalar *xx, *yy, *uxx, *uyy;
339:   PetscInt     blda, clda;
340:   PetscMPIInt  size;
341:   PetscSF      bsf, csf;
342:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

344:   HLibProfile::clear();
345:   #if defined(PETSC_H2OPUS_USE_GPU)
346:   boundtocpu = A->boundtocpu;
347:   #endif
348:   MatDenseGetLDA(B, &blda);
349:   MatDenseGetLDA(C, &clda);
350:   if (usesf) {
351:     PetscInt n;

353:     MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf);
354:     MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf);

356:     MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N);
357:     PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
358:     blda = n;
359:     clda = n;
360:   }
361:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
362:   if (boundtocpu) {
363:     MatDenseGetArrayRead(B, (const PetscScalar **)&xx);
364:     MatDenseGetArrayWrite(C, &yy);
365:     if (usesf) {
366:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
367:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
368:       PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
369:       PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
370:     } else {
371:       uxx = xx;
372:       uyy = yy;
373:     }
374:     if (size > 1) {
377:   #if defined(H2OPUS_USE_MPI)
378:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
379:   #endif
380:     } else {
382:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
383:     }
384:     MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx);
385:     if (usesf) {
386:       PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
387:       PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
388:     }
389:     MatDenseRestoreArrayWrite(C, &yy);
390:   #if defined(PETSC_H2OPUS_USE_GPU)
391:   } else {
392:     PetscBool ciscuda, biscuda;

394:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
395:     PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
396:     if (!biscuda) MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B);
397:     PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
398:     if (!ciscuda) {
399:       C->assembled = PETSC_TRUE;
400:       MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C);
401:     }
402:     MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx);
403:     MatDenseCUDAGetArrayWrite(C, &yy);
404:     if (usesf) {
405:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
406:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
407:       PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
408:       PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
409:     } else {
410:       uxx = xx;
411:       uyy = yy;
412:     }
413:     PetscLogGpuTimeBegin();
414:     if (size > 1) {
417:     #if defined(H2OPUS_USE_MPI)
418:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
419:     #endif
420:     } else {
422:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
423:     }
424:     PetscLogGpuTimeEnd();
425:     MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx);
426:     if (usesf) {
427:       PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
428:       PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
429:     }
430:     MatDenseCUDARestoreArrayWrite(C, &yy);
431:     if (!biscuda) MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B);
432:     if (!ciscuda) MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C);
433:   #endif
434:   }
435:   { /* log flops */
436:     double gops, time, perf, dev;
437:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
438:   #if defined(PETSC_H2OPUS_USE_GPU)
439:     if (boundtocpu) {
440:       PetscLogFlops(1e9 * gops);
441:     } else {
442:       PetscLogGpuFlops(1e9 * gops);
443:     }
444:   #else
445:     PetscLogFlops(1e9 * gops);
446:   #endif
447:   }
448:   return 0;
449: }

451: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
452: {
453:   Mat_Product *product = C->product;

455:   MatCheckProduct(C, 1);
456:   switch (product->type) {
457:   case MATPRODUCT_AB:
458:     MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C);
459:     break;
460:   case MATPRODUCT_AtB:
461:     MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C);
462:     break;
463:   default:
464:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
465:   }
466:   return 0;
467: }

469: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
470: {
471:   Mat_Product *product = C->product;
472:   PetscBool    cisdense;
473:   Mat          A, B;

475:   MatCheckProduct(C, 1);
476:   A = product->A;
477:   B = product->B;
478:   switch (product->type) {
479:   case MATPRODUCT_AB:
480:     MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N);
481:     MatSetBlockSizesFromMats(C, product->A, product->B);
482:     PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
483:     if (!cisdense) MatSetType(C, ((PetscObject)product->B)->type_name);
484:     MatSetUp(C);
485:     break;
486:   case MATPRODUCT_AtB:
487:     MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N);
488:     MatSetBlockSizesFromMats(C, product->A, product->B);
489:     PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "");
490:     if (!cisdense) MatSetType(C, ((PetscObject)product->B)->type_name);
491:     MatSetUp(C);
492:     break;
493:   default:
494:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
495:   }
496:   C->ops->productsymbolic = NULL;
497:   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
498:   return 0;
499: }

501: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
502: {
503:   MatCheckProduct(C, 1);
504:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
505:   return 0;
506: }

508: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
509: {
510:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
511:   #if defined(H2OPUS_USE_MPI)
512:   h2opusHandle_t handle = h2opus->handle->handle;
513:   #else
514:   h2opusHandle_t handle = h2opus->handle;
515:   #endif
516:   PetscBool    boundtocpu = PETSC_TRUE;
517:   PetscInt     n;
518:   PetscScalar *xx, *yy, *uxx, *uyy;
519:   PetscMPIInt  size;
520:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

522:   HLibProfile::clear();
523:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
524:   #if defined(PETSC_H2OPUS_USE_GPU)
525:   boundtocpu = A->boundtocpu;
526:   #endif
527:   if (usesf) {
528:     PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL);
529:   } else n = A->rmap->n;
530:   if (boundtocpu) {
531:     VecGetArrayRead(x, (const PetscScalar **)&xx);
532:     if (sy == 0.0) {
533:       VecGetArrayWrite(y, &yy);
534:     } else {
535:       VecGetArray(y, &yy);
536:     }
537:     if (usesf) {
538:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
539:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

541:       PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
542:       PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
543:       if (sy != 0.0) {
544:         PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
545:         PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
546:       }
547:     } else {
548:       uxx = xx;
549:       uyy = yy;
550:     }
551:     if (size > 1) {
554:   #if defined(H2OPUS_USE_MPI)
555:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
556:   #endif
557:     } else {
559:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
560:     }
561:     VecRestoreArrayRead(x, (const PetscScalar **)&xx);
562:     if (usesf) {
563:       PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
564:       PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
565:     }
566:     if (sy == 0.0) {
567:       VecRestoreArrayWrite(y, &yy);
568:     } else {
569:       VecRestoreArray(y, &yy);
570:     }
571:   #if defined(PETSC_H2OPUS_USE_GPU)
572:   } else {
573:     VecCUDAGetArrayRead(x, (const PetscScalar **)&xx);
574:     if (sy == 0.0) {
575:       VecCUDAGetArrayWrite(y, &yy);
576:     } else {
577:       VecCUDAGetArray(y, &yy);
578:     }
579:     if (usesf) {
580:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
581:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

583:       PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
584:       PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE);
585:       if (sy != 0.0) {
586:         PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
587:         PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE);
588:       }
589:     } else {
590:       uxx = xx;
591:       uyy = yy;
592:     }
593:     PetscLogGpuTimeBegin();
594:     if (size > 1) {
597:     #if defined(H2OPUS_USE_MPI)
598:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
599:     #endif
600:     } else {
602:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
603:     }
604:     PetscLogGpuTimeEnd();
605:     VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx);
606:     if (usesf) {
607:       PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
608:       PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE);
609:     }
610:     if (sy == 0.0) {
611:       VecCUDARestoreArrayWrite(y, &yy);
612:     } else {
613:       VecCUDARestoreArray(y, &yy);
614:     }
615:   #endif
616:   }
617:   { /* log flops */
618:     double gops, time, perf, dev;
619:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
620:   #if defined(PETSC_H2OPUS_USE_GPU)
621:     if (boundtocpu) {
622:       PetscLogFlops(1e9 * gops);
623:     } else {
624:       PetscLogGpuFlops(1e9 * gops);
625:     }
626:   #else
627:     PetscLogFlops(1e9 * gops);
628:   #endif
629:   }
630:   return 0;
631: }

633: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
634: {
635:   PetscBool xiscuda, yiscuda;

637:   PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
638:   PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
639:   MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda);
640:   MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE);
641:   return 0;
642: }

644: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
645: {
646:   PetscBool xiscuda, yiscuda;

648:   PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
649:   PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
650:   MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda);
651:   MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE);
652:   return 0;
653: }

655: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
656: {
657:   PetscBool xiscuda, ziscuda;

659:   VecCopy(y, z);
660:   PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
661:   PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "");
662:   MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda);
663:   MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE);
664:   return 0;
665: }

667: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668: {
669:   PetscBool xiscuda, ziscuda;

671:   VecCopy(y, z);
672:   PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
673:   PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "");
674:   MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda);
675:   MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE);
676:   return 0;
677: }

679: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
680: {
681:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

683:   a->s *= s;
684:   return 0;
685: }

687: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
688: {
689:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

691:   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
692:   PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL);
693:   PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL);
694:   PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL);
695:   PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL);
696:   PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL);
697:   PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be used when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL);
698:   PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL);
699:   PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL);
700:   PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL);
701:   PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL);
702:   PetscOptionsHeadEnd();
703:   return 0;
704: }

706: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);

708: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
709: {
710:   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
711:   Vec                c;
712:   PetscInt           spacedim;
713:   const PetscScalar *coords;

715:   if (a->ptcloud) return 0;
716:   PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c);
717:   if (!c && a->sampler) {
718:     Mat S = a->sampler->GetSamplingMat();

720:     PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c);
721:   }
722:   if (!c) {
723:     MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL);
724:   } else {
725:     VecGetArrayRead(c, &coords);
726:     VecGetBlockSize(c, &spacedim);
727:     MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL);
728:     VecRestoreArrayRead(c, &coords);
729:   }
730:   return 0;
731: }

733: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
734: {
735:   MPI_Comm      comm;
736:   PetscMPIInt   size;
737:   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
738:   PetscInt      n = 0, *idx = NULL;
739:   int          *iidx = NULL;
740:   PetscCopyMode own;
741:   PetscBool     rid;

743:   if (a->multsetup) return 0;
744:   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
745:     PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL);
746:   #if defined(PETSC_H2OPUS_USE_GPU)
747:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
748:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
749:     a->xxs_gpu = 1;
750:     a->yys_gpu = 1;
751:   #endif
752:     a->xx  = new thrust::host_vector<PetscScalar>(n);
753:     a->yy  = new thrust::host_vector<PetscScalar>(n);
754:     a->xxs = 1;
755:     a->yys = 1;
756:   } else {
757:     IS is;
758:     PetscObjectGetComm((PetscObject)A, &comm);
759:     MPI_Comm_size(comm, &size);
760:     if (!a->h2opus_indexmap) {
761:       if (size > 1) {
763:   #if defined(H2OPUS_USE_MPI)
764:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
765:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
766:   #endif
767:       } else {
768:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
769:         n    = a->hmatrix->u_basis_tree.index_map.size();
770:       }

772:       if (PetscDefined(USE_64BIT_INDICES)) {
773:         PetscInt i;

775:         own = PETSC_OWN_POINTER;
776:         PetscMalloc1(n, &idx);
777:         for (i = 0; i < n; i++) idx[i] = iidx[i];
778:       } else {
779:         own = PETSC_COPY_VALUES;
780:         idx = (PetscInt *)iidx;
781:       }
782:       ISCreateGeneral(comm, n, idx, own, &is);
783:       ISSetPermutation(is);
784:       ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view");
785:       a->h2opus_indexmap = is;
786:     }
787:     ISGetLocalSize(a->h2opus_indexmap, &n);
788:     ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx);
789:     rid = (PetscBool)(n == A->rmap->n);
790:     MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm);
791:     if (rid) ISIdentity(a->h2opus_indexmap, &rid);
792:     if (!rid) {
793:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
794:         PetscLayoutCreate(comm, &a->h2opus_rmap);
795:         PetscLayoutSetLocalSize(a->h2opus_rmap, n);
796:         PetscLayoutSetUp(a->h2opus_rmap);
797:         PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap);
798:       }
799:       PetscSFCreate(comm, &a->sf);
800:       PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx);
801:       PetscSFSetUp(a->sf);
802:       PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view");
803:   #if defined(PETSC_H2OPUS_USE_GPU)
804:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
805:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
806:       a->xxs_gpu = 1;
807:       a->yys_gpu = 1;
808:   #endif
809:       a->xx  = new thrust::host_vector<PetscScalar>(n);
810:       a->yy  = new thrust::host_vector<PetscScalar>(n);
811:       a->xxs = 1;
812:       a->yys = 1;
813:     }
814:     ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx);
815:   }
816:   a->multsetup = PETSC_TRUE;
817:   return 0;
818: }

820: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
821: {
822:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
823:   #if defined(H2OPUS_USE_MPI)
824:   h2opusHandle_t handle = a->handle->handle;
825:   #else
826:   h2opusHandle_t handle = a->handle;
827:   #endif
828:   PetscBool   kernel       = PETSC_FALSE;
829:   PetscBool   boundtocpu   = PETSC_TRUE;
830:   PetscBool   samplingdone = PETSC_FALSE;
831:   MPI_Comm    comm;
832:   PetscMPIInt size;

834:   PetscObjectGetComm((PetscObject)A, &comm);

838:   /* XXX */
839:   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

841:   MPI_Comm_size(comm, &size);
842:   /* TODO REUSABILITY of geometric construction */
843:   delete a->hmatrix;
844:   delete a->dist_hmatrix;
845:   #if defined(PETSC_H2OPUS_USE_GPU)
846:   delete a->hmatrix_gpu;
847:   delete a->dist_hmatrix_gpu;
848:   #endif
849:   a->orthogonal = PETSC_FALSE;

851:   /* TODO: other? */
852:   H2OpusBoxCenterAdmissibility adm(a->eta);

854:   PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0);
855:   if (size > 1) {
856:   #if defined(H2OPUS_USE_MPI)
857:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
858:   #else
859:     a->dist_hmatrix = NULL;
860:   #endif
861:   } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
862:   MatH2OpusInferCoordinates_Private(A);
864:   if (a->kernel) {
865:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
866:     if (size > 1) {
868:   #if defined(H2OPUS_USE_MPI)
869:       buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
870:   #endif
871:     } else {
872:       buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
873:     }
874:     kernel = PETSC_TRUE;
875:   } else {
877:     buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
878:   }
879:   MatSetUpMultiply_H2OPUS(A);

881:   #if defined(PETSC_H2OPUS_USE_GPU)
882:   boundtocpu = A->boundtocpu;
883:   if (!boundtocpu) {
884:     if (size > 1) {
886:     #if defined(H2OPUS_USE_MPI)
887:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
888:     #endif
889:     } else {
890:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
891:     }
892:   }
893:   #endif
894:   if (size == 1) {
895:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
896:       PetscReal Anorm;
897:       bool      verbose;

899:       PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL);
900:       verbose = a->hara_verbose;
901:       MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm);
902:       if (a->hara_verbose) PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs);
903:       if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
904:       a->sampler->SetStream(handle->getMainStream());
905:       if (boundtocpu) {
906:         a->sampler->SetGPUSampling(false);
907:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
908:   #if defined(PETSC_H2OPUS_USE_GPU)
909:       } else {
910:         a->sampler->SetGPUSampling(true);
911:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
912:   #endif
913:       }
914:       samplingdone = PETSC_TRUE;
915:     }
916:   }
917:   #if defined(PETSC_H2OPUS_USE_GPU)
918:   if (!boundtocpu) {
919:     delete a->hmatrix;
920:     delete a->dist_hmatrix;
921:     a->hmatrix      = NULL;
922:     a->dist_hmatrix = NULL;
923:   }
924:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
925:   #endif
926:   PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0);

928:   if (!a->s) a->s = 1.0;
929:   A->assembled = PETSC_TRUE;

931:   if (samplingdone) {
932:     PetscBool check  = a->check_construction;
933:     PetscBool checke = PETSC_FALSE;

935:     PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL);
936:     PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL);
937:     if (check) {
938:       Mat       E, Ae;
939:       PetscReal n1, ni, n2;
940:       PetscReal n1A, niA, n2A;
941:       void (*normfunc)(void);

943:       Ae = a->sampler->GetSamplingMat();
944:       MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E);
945:       MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS);
946:       MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN);
947:       MatNorm(E, NORM_1, &n1);
948:       MatNorm(E, NORM_INFINITY, &ni);
949:       MatNorm(E, NORM_2, &n2);
950:       if (checke) {
951:         Mat eA, eE, eAe;

953:         MatComputeOperator(A, MATAIJ, &eA);
954:         MatComputeOperator(E, MATAIJ, &eE);
955:         MatComputeOperator(Ae, MATAIJ, &eAe);
956:         MatChop(eA, PETSC_SMALL);
957:         MatChop(eE, PETSC_SMALL);
958:         MatChop(eAe, PETSC_SMALL);
959:         PetscObjectSetName((PetscObject)eA, "H2Mat");
960:         MatView(eA, NULL);
961:         PetscObjectSetName((PetscObject)eAe, "S");
962:         MatView(eAe, NULL);
963:         PetscObjectSetName((PetscObject)eE, "H2Mat - S");
964:         MatView(eE, NULL);
965:         MatDestroy(&eA);
966:         MatDestroy(&eE);
967:         MatDestroy(&eAe);
968:       }

970:       MatGetOperation(Ae, MATOP_NORM, &normfunc);
971:       MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS);
972:       MatNorm(Ae, NORM_1, &n1A);
973:       MatNorm(Ae, NORM_INFINITY, &niA);
974:       MatNorm(Ae, NORM_2, &n2A);
975:       n1A = PetscMax(n1A, PETSC_SMALL);
976:       n2A = PetscMax(n2A, PETSC_SMALL);
977:       niA = PetscMax(niA, PETSC_SMALL);
978:       MatSetOperation(Ae, MATOP_NORM, normfunc);
979:       PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A));
980:       MatDestroy(&E);
981:     }
982:     a->sampler->SetSamplingMat(NULL);
983:   }
984:   return 0;
985: }

987: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
988: {
989:   PetscMPIInt size;
990:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

992:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
993:   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported in parallel"); /* distributed clearData() is still a TODO, see the list at the top of this file */
994:   a->hmatrix->clearData();
995:   #if defined(PETSC_H2OPUS_USE_GPU)
996:   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
997:   #endif
998:   return 0;
999: }

1001: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1002: {
1003:   Mat         A;
1004:   Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1005:   #if defined(PETSC_H2OPUS_USE_GPU)
1006:   PetscBool iscpu = PETSC_FALSE;
1007:   #else
1008:   PetscBool iscpu = PETSC_TRUE;
1009:   #endif
1010:   MPI_Comm comm;

1012:   PetscObjectGetComm((PetscObject)B, &comm);
1013:   MatCreate(comm, &A);
1014:   MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N);
1015:   MatSetType(A, MATH2OPUS);
1016:   MatPropagateSymmetryOptions(B, A);
1017:   a = (Mat_H2OPUS *)A->data;

1019:   a->eta              = b->eta;
1020:   a->leafsize         = b->leafsize;
1021:   a->basisord         = b->basisord;
1022:   a->max_rank         = b->max_rank;
1023:   a->bs               = b->bs;
1024:   a->rtol             = b->rtol;
1025:   a->norm_max_samples = b->norm_max_samples;
1026:   if (op == MAT_COPY_VALUES) a->s = b->s;

1028:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1029:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1031:   #if defined(H2OPUS_USE_MPI)
1032:   if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1033:     #if defined(PETSC_H2OPUS_USE_GPU)
1034:   if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1035:     #endif
1036:   #endif
1037:   if (b->hmatrix) {
1038:     a->hmatrix = new HMatrix(*b->hmatrix);
1039:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1040:   }
1041:   #if defined(PETSC_H2OPUS_USE_GPU)
1042:   if (b->hmatrix_gpu) {
1043:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1044:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1045:   }
1046:   #endif
1047:   if (b->sf) {
1048:     PetscObjectReference((PetscObject)b->sf);
1049:     a->sf = b->sf;
1050:   }
1051:   if (b->h2opus_indexmap) {
1052:     PetscObjectReference((PetscObject)b->h2opus_indexmap);
1053:     a->h2opus_indexmap = b->h2opus_indexmap;
1054:   }

1056:   MatSetUp(A);
1057:   MatSetUpMultiply_H2OPUS(A);
1058:   if (op == MAT_COPY_VALUES) {
1059:     A->assembled  = PETSC_TRUE;
1060:     a->orthogonal = b->orthogonal;
1061:   #if defined(PETSC_H2OPUS_USE_GPU)
1062:     A->offloadmask = B->offloadmask;
1063:   #endif
1064:   }
1065:   #if defined(PETSC_H2OPUS_USE_GPU)
1066:   iscpu = B->boundtocpu;
1067:   #endif
1068:   MatBindToCPU(A, iscpu);

1070:   *nA = A;
1071:   return 0;
1072: }

1074: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1075: {
1076:   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1077:   PetscBool         isascii, vieweps;
1078:   PetscMPIInt       size;
1079:   PetscViewerFormat format;

1081:   PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii);
1082:   PetscViewerGetFormat(view, &format);
1083:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1084:   if (isascii) {
1085:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1086:       if (size == 1) {
1087:         FILE *fp;
1088:         PetscViewerASCIIGetPointer(view, &fp);
1089:         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1090:       }
1091:     } else {
1092:       PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat");
1093:       PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1094:       PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta);
1095:       if (!h2opus->kernel) {
1096:         PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol);
1097:       } else {
1098:         PetscViewerASCIIPrintf(view, "  Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord);
1099:       }
1100:       PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples);
1101:       if (size == 1) {
1102:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1103:         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1104:   #if defined(PETSC_HAVE_CUDA)
1105:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1106:         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1107:   #endif
1108:         PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1109:   #if defined(PETSC_HAVE_CUDA)
1110:         PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1111:   #endif
1112:       } else {
1113:   #if defined(PETSC_HAVE_CUDA)
1114:         double      matrix_mem[4] = {0., 0., 0., 0.};
1115:         PetscMPIInt rsize         = 4;
1116:   #else
1117:         double      matrix_mem[2] = {0., 0.};
1118:         PetscMPIInt rsize         = 2;
1119:   #endif
1120:   #if defined(H2OPUS_USE_MPI)
1121:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1122:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1123:     #if defined(PETSC_HAVE_CUDA)
1124:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1125:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1126:     #endif
1127:   #endif
1128:         MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A));
1129:         PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1130:   #if defined(PETSC_HAVE_CUDA)
1131:         PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1132:   #endif
1133:       }
1134:     }
1135:   }
1136:   vieweps = PETSC_FALSE;
1137:   PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL);
1138:   if (vieweps) {
1139:     char        filename[256];
1140:     const char *name;

1142:     PetscObjectGetName((PetscObject)A, &name);
1143:     PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name);
1144:     PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL);
1145:     outputEps(*h2opus->hmatrix, filename);
1146:   }
1147:   return 0;
1148: }

1150: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1151: {
1152:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1153:   PetscReal  *gcoords;
1154:   PetscInt    N;
1155:   MPI_Comm    comm;
1156:   PetscMPIInt size;
1157:   PetscBool   cong;

1159:   PetscLayoutSetUp(A->rmap);
1160:   PetscLayoutSetUp(A->cmap);
1161:   PetscObjectGetComm((PetscObject)A, &comm);
1162:   MatHasCongruentLayouts(A, &cong);
1163:   if (!cong) SETERRQ(comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1164:   N = A->rmap->N;
1165:   MPI_Comm_size(comm, &size);
1166:   if (spacedim > 0 && size > 1 && cdist) {
1167:     PetscSF      sf;
1168:     MPI_Datatype dtype;

1170:     MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype);
1171:     MPI_Type_commit(&dtype);

1173:     PetscSFCreate(comm, &sf);
1174:     PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER);
1175:     PetscMalloc1(spacedim * N, &gcoords);
1176:     PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE);
1177:     PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE);
1178:     PetscSFDestroy(&sf);
1179:     MPI_Type_free(&dtype);
1180:   } else gcoords = (PetscReal *)coords;

1182:   delete h2opus->ptcloud;
1183:   delete h2opus->kernel;
1184:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1185:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1186:   if (gcoords != coords) PetscFree(gcoords);
1187:   A->preallocated = PETSC_TRUE;
1188:   return 0;
1189: }

1191:   #if defined(PETSC_H2OPUS_USE_GPU)
1192: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1193: {
1194:   PetscMPIInt size;
1195:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1197:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1198:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1199:     if (size > 1) {
1201:     #if defined(H2OPUS_USE_MPI)
1202:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1203:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1204:     #endif
1205:     } else {
1207:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1208:       else *a->hmatrix = *a->hmatrix_gpu;
1209:     }
1210:     delete a->hmatrix_gpu;
1211:     delete a->dist_hmatrix_gpu;
1212:     a->hmatrix_gpu      = NULL;
1213:     a->dist_hmatrix_gpu = NULL;
1214:     A->offloadmask      = PETSC_OFFLOAD_CPU;
1215:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1216:     if (size > 1) {
1218:     #if defined(H2OPUS_USE_MPI)
1219:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1220:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1221:     #endif
1222:     } else {
1224:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1225:       else *a->hmatrix_gpu = *a->hmatrix;
1226:     }
1227:     delete a->hmatrix;
1228:     delete a->dist_hmatrix;
1229:     a->hmatrix      = NULL;
1230:     a->dist_hmatrix = NULL;
1231:     A->offloadmask  = PETSC_OFFLOAD_GPU;
1232:   }
1233:   PetscFree(A->defaultvectype);
1234:   if (!flg) {
1235:     PetscStrallocpy(VECCUDA, &A->defaultvectype);
1236:   } else {
1237:     PetscStrallocpy(VECSTANDARD, &A->defaultvectype);
1238:   }
1239:   A->boundtocpu = flg;
1240:   return 0;
1241: }
1242:   #endif

1244: /*MC
1245:      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.

1247:    Options Database Keys:
1248: .     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1250:    Notes:
1251:      H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs.

1253:      For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust to use H2Opus.
1254:      To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
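
     A minimal usage sketch (m, n, M, N are placeholder sizes; in practice the convenience
     constructors MatCreateH2OpusFromKernel() and MatCreateH2OpusFromMat() below are preferred):
.vb
     Mat A;
     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, m, n, M, N);
     MatSetType(A, MATH2OPUS);
     MatSetFromOptions(A);
.ve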

1256:    Level: beginner

1258:    Reference:
1259: .  * -  "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451

1261: .seealso: `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1262: M*/
1263: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1264: {
1265:   Mat_H2OPUS *a;
1266:   PetscMPIInt size;

1268:   #if defined(PETSC_H2OPUS_USE_GPU)
1269:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1270:   #endif
1271:   PetscNew(&a);
1272:   A->data = (void *)a;

1274:   a->eta              = 0.9;
1275:   a->leafsize         = 32;
1276:   a->basisord         = 4;
1277:   a->max_rank         = 64;
1278:   a->bs               = 32;
1279:   a->rtol             = 1.e-4;
1280:   a->s                = 1.0;
1281:   a->norm_max_samples = 10;
1282:   a->resize           = PETSC_TRUE; /* reallocate after compression */
1283:   #if defined(H2OPUS_USE_MPI)
1284:   h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1285:   #else
1286:   h2opusCreateHandle(&a->handle);
1287:   #endif
1288:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1289:   PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS);
1290:   PetscMemzero(A->ops, sizeof(struct _MatOps));

1292:   A->ops->destroy          = MatDestroy_H2OPUS;
1293:   A->ops->view             = MatView_H2OPUS;
1294:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1295:   A->ops->mult             = MatMult_H2OPUS;
1296:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1297:   A->ops->multadd          = MatMultAdd_H2OPUS;
1298:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1299:   A->ops->scale            = MatScale_H2OPUS;
1300:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1301:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1302:   A->ops->norm             = MatNorm_H2OPUS;
1303:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1304:   #if defined(PETSC_H2OPUS_USE_GPU)
1305:   A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1306:   #endif

1308:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS);
1309:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS);
1310:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS);
1311:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS);
1312:   #if defined(PETSC_H2OPUS_USE_GPU)
1313:   PetscFree(A->defaultvectype);
1314:   PetscStrallocpy(VECCUDA, &A->defaultvectype);
1315:   #endif
1316:   return 0;
1317: }

1319: /*@C
1320:      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1322:    Input Parameter:
1323: .     A - the matrix
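
   Notes:
   MatH2OpusCompress() calls this internally before truncating; a minimal sketch of an explicit
   call (the tolerance value is illustrative):
.vb
   MatH2OpusOrthogonalize(A);
   MatH2OpusCompress(A, 1e-6);
.ve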

1325:    Level: intermediate

1327: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1328: @*/
1329: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1330: {
1331:   PetscBool   ish2opus;
1332:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1333:   PetscMPIInt size;
1334:   PetscBool   boundtocpu = PETSC_TRUE;

1338:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1339:   if (!ish2opus) return 0;
1340:   if (a->orthogonal) return 0;
1341:   HLibProfile::clear();
1342:   PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0);
1343:   #if defined(PETSC_H2OPUS_USE_GPU)
1344:   boundtocpu = A->boundtocpu;
1345:   #endif
1346:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1347:   if (size > 1) {
1348:     if (boundtocpu) {
1350:   #if defined(H2OPUS_USE_MPI)
1351:       distributed_horthog(*a->dist_hmatrix, a->handle);
1352:   #endif
1353:   #if defined(PETSC_H2OPUS_USE_GPU)
1354:       A->offloadmask = PETSC_OFFLOAD_CPU;
1355:     } else {
1357:       PetscLogGpuTimeBegin();
1358:     #if defined(H2OPUS_USE_MPI)
1359:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1360:     #endif
1361:       PetscLogGpuTimeEnd();
1362:   #endif
1363:     }
1364:   } else {
1365:   #if defined(H2OPUS_USE_MPI)
1366:     h2opusHandle_t handle = a->handle->handle;
1367:   #else
1368:     h2opusHandle_t handle = a->handle;
1369:   #endif
1370:     if (boundtocpu) {
1372:       horthog(*a->hmatrix, handle);
1373:   #if defined(PETSC_H2OPUS_USE_GPU)
1374:       A->offloadmask = PETSC_OFFLOAD_CPU;
1375:     } else {
1377:       PetscLogGpuTimeBegin();
1378:       horthog(*a->hmatrix_gpu, handle);
1379:       PetscLogGpuTimeEnd();
1380:   #endif
1381:     }
1382:   }
1383:   a->orthogonal = PETSC_TRUE;
1384:   { /* log flops */
1385:     double gops, time, perf, dev;
1386:     HLibProfile::getHorthogPerf(gops, time, perf, dev);
1387:   #if defined(PETSC_H2OPUS_USE_GPU)
1388:     if (boundtocpu) {
1389:       PetscLogFlops(1e9 * gops);
1390:     } else {
1391:       PetscLogGpuFlops(1e9 * gops);
1392:     }
1393:   #else
1394:     PetscLogFlops(1e9 * gops);
1395:   #endif
1396:   }
1397:   PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0);
1398:   return 0;
1399: }

1401: /*@C
1402:      MatH2OpusCompress - Compress a hierarchical matrix.

1404:    Input Parameters:
1405: +     A - the matrix
1406: -     tol - the absolute truncation threshold
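
   Notes:
   A minimal usage sketch on an assembled matrix (the threshold value is illustrative):
.vb
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
   MatH2OpusCompress(A, 1e-6);
.ve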

1408:    Level: intermediate

1410: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1411: @*/
1412: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1413: {
1414:   PetscBool   ish2opus;
1415:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1416:   PetscMPIInt size;
1417:   PetscBool   boundtocpu = PETSC_TRUE;

1422:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1423:   if (!ish2opus || tol <= 0.0) return 0;
1424:   MatH2OpusOrthogonalize(A);
1425:   HLibProfile::clear();
1426:   PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0);
1427:   #if defined(PETSC_H2OPUS_USE_GPU)
1428:   boundtocpu = A->boundtocpu;
1429:   #endif
1430:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1431:   if (size > 1) {
1432:     if (boundtocpu) {
1434:   #if defined(H2OPUS_USE_MPI)
1435:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1436:       if (a->resize) {
1437:         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1438:         delete a->dist_hmatrix;
1439:         a->dist_hmatrix = dist_hmatrix;
1440:       }
1441:   #endif
1442:   #if defined(PETSC_H2OPUS_USE_GPU)
1443:       A->offloadmask = PETSC_OFFLOAD_CPU;
1444:     } else {
1446:       PetscLogGpuTimeBegin();
1447:     #if defined(H2OPUS_USE_MPI)
1448:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

1450:       if (a->resize) {
1451:         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1452:         delete a->dist_hmatrix_gpu;
1453:         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1454:       }
1455:     #endif
1456:       PetscLogGpuTimeEnd();
1457:   #endif
1458:     }
1459:   } else {
1460:   #if defined(H2OPUS_USE_MPI)
1461:     h2opusHandle_t handle = a->handle->handle;
1462:   #else
1463:     h2opusHandle_t handle = a->handle;
1464:   #endif
1465:     if (boundtocpu) {
1467:       hcompress(*a->hmatrix, tol, handle);

1469:       if (a->resize) {
1470:         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1471:         delete a->hmatrix;
1472:         a->hmatrix = hmatrix;
1473:       }
1474:   #if defined(PETSC_H2OPUS_USE_GPU)
1475:       A->offloadmask = PETSC_OFFLOAD_CPU;
1476:     } else {
1478:       PetscLogGpuTimeBegin();
1479:       hcompress(*a->hmatrix_gpu, tol, handle);
1480:       PetscLogGpuTimeEnd();

1482:       if (a->resize) {
1483:         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1484:         delete a->hmatrix_gpu;
1485:         a->hmatrix_gpu = hmatrix_gpu;
1486:       }
1487:   #endif
1488:     }
1489:   }
1490:   { /* log flops */
1491:     double gops, time, perf, dev;
1492:     HLibProfile::getHcompressPerf(gops, time, perf, dev);
1493:   #if defined(PETSC_H2OPUS_USE_GPU)
1494:     if (boundtocpu) {
1495:       PetscLogFlops(1e9 * gops);
1496:     } else {
1497:       PetscLogGpuFlops(1e9 * gops);
1498:     }
1499:   #else
1500:     PetscLogFlops(1e9 * gops);
1501:   #endif
1502:   }
1503:   PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0);
1504:   return 0;
1505: }

1507: /*@C
1508:      MatH2OpusSetSamplingMat - Set the matrix to be sampled via matrix-vector products when constructing the hierarchical matrix.

1510:    Input Parameters:
1511: +     A - the hierarchical matrix
1512: .     B - the matrix to be sampled
1513: .     bs - maximum number of samples to be taken concurrently
1514: -     tol - relative tolerance for construction

1516:    Note:
1517:    You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.

1519:    Level: intermediate

1521: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1522: @*/
1523: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1524: {
1525:   PetscBool ish2opus;

1532:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1533:   if (ish2opus) {
1534:     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1536:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1537:     a->sampler->SetSamplingMat(B);
1538:     if (bs > 0) a->bs = bs;
1539:     if (tol > 0.) a->rtol = tol;
1540:     delete a->kernel;
1541:   }
1542:   return 0;
1543: }
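/* Usage sketch (illustrative): attach a sampling operator and rebuild the
   hierarchical matrix; B, bs = 32, and tol = 1e-4 are assumed example values.
   Per the Note above, the (re)construction happens at assembly time.

     Mat A, B; // A is MATH2OPUS, B is the operator to sample
     MatH2OpusSetSamplingMat(A, B, 32, 1e-4);
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/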

1545: /*@C
1546:      MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.

1548:    Input Parameters:
1549: +     comm - MPI communicator
1550: .     m - number of local rows (or `PETSC_DECIDE` to have it calculated if M is given)
1551: .     n - number of local columns (or `PETSC_DECIDE` to have it calculated if N is given)
1552: .     M - number of global rows (or `PETSC_DETERMINE` to have it calculated if m is given)
1553: .     N - number of global columns (or `PETSC_DETERMINE` to have it calculated if n is given)
1554: .     spacedim - dimension of the space coordinates
1555: .     coords - coordinates of the points
1556: .     cdist - whether or not coordinates are distributed
1557: .     kernel - computational kernel (or NULL)
1558: .     kernelctx - kernel context
1559: .     eta - admissibility condition tolerance
1560: .     leafsize - leaf size in cluster tree
1561: -     basisord - approximation order for Chebyshev interpolation of low-rank blocks

1563:    Output Parameter:
1564: .     nA - matrix

1566:    Options Database Keys:
1567: +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1568: .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1569: .     -mat_h2opus_order <`PetscInt`> - Chebyshev approximation order
1570: -     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

1572:    Level: intermediate

1574: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1575: @*/
1576: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1577: {
1578:   Mat         A;
1579:   Mat_H2OPUS *h2opus;
1580:   #if defined(PETSC_H2OPUS_USE_GPU)
1581:   PetscBool iscpu = PETSC_FALSE;
1582:   #else
1583:   PetscBool iscpu = PETSC_TRUE;
1584:   #endif

1587:   MatCreate(comm, &A);
1588:   MatSetSizes(A, m, n, M, N);
1590:   MatSetType(A, MATH2OPUS);
1591:   MatBindToCPU(A, iscpu);
1592:   MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx);

1594:   h2opus = (Mat_H2OPUS *)A->data;
1595:   if (eta > 0.) h2opus->eta = eta;
1596:   if (leafsize > 0) h2opus->leafsize = leafsize;
1597:   if (basisord > 0) h2opus->basisord = basisord;

1599:   *nA = A;
1600:   return 0;
1601: }
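/* Usage sketch (illustrative): create a MATH2OPUS from a user kernel. The
   Gaussian kernel and all numeric values below are assumptions for the sake of
   the example; passing 0 for eta, leafsize, and basisord keeps the defaults,
   since the code above only overrides strictly positive values.

     static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2);
     }

     Mat       A;
     PetscReal coords[3 * 1000]; // 1000 points in 3D, filled by the caller
     MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 1000, 1000,
                               3, coords, PETSC_FALSE, GaussKernel, NULL, 0.0, 0, 0, &A);
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/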

1603: /*@C
1604:      MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.

1606:    Input Parameters:
1607: +     B - the matrix to be sampled
1608: .     spacedim - dimension of the space coordinates
1609: .     coords - coordinates of the points
1610: .     cdist - whether or not coordinates are distributed
1611: .     eta - admissibility condition tolerance
1612: .     leafsize - leaf size in cluster tree
1613: .     maxrank - maximum rank allowed
1614: .     bs - maximum number of samples to be taken concurrently
1615: -     rtol - relative tolerance for construction

1617:    Output Parameter:
1618: .     nA - matrix

1620:    Options Database Keys:
1621: +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1622: .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1623: .     -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1624: .     -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1625: .     -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1626: .     -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during `MatAssemblyEnd()`
1627: .     -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1628: -     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

1630:    Note:
1631:    Not available in parallel

1633:    Level: intermediate

1635: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1636: @*/
1637: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1638: {
1639:   Mat         A;
1640:   Mat_H2OPUS *h2opus;
1641:   MPI_Comm    comm;
1642:   PetscBool   boundtocpu = PETSC_TRUE;

1653:   PetscObjectGetComm((PetscObject)B, &comm);
1656:   MatCreate(comm, &A);
1657:   MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N);
1658:   #if defined(PETSC_H2OPUS_USE_GPU)
1659:   {
1660:     PetscBool iscuda;
1661:     VecType   vtype;

1663:     MatGetVecType(B, &vtype);
1664:     PetscStrcmp(vtype, VECCUDA, &iscuda);
1665:     if (!iscuda) {
1666:       PetscStrcmp(vtype, VECSEQCUDA, &iscuda);
1667:       if (!iscuda) PetscStrcmp(vtype, VECMPICUDA, &iscuda);
1668:     }
1669:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1670:   }
1671:   #endif
1672:   MatSetType(A, MATH2OPUS);
1673:   MatBindToCPU(A, boundtocpu);
1674:   if (spacedim) MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL);
1675:   MatPropagateSymmetryOptions(B, A);

1678:   h2opus          = (Mat_H2OPUS *)A->data;
1679:   h2opus->sampler = new PetscMatrixSampler(B);
1680:   if (eta > 0.) h2opus->eta = eta;
1681:   if (leafsize > 0) h2opus->leafsize = leafsize;
1682:   if (maxrank > 0) h2opus->max_rank = maxrank;
1683:   if (bs > 0) h2opus->bs = bs;
1684:   if (rtol > 0.) h2opus->rtol = rtol;
1685:   *nA             = A;
1686:   A->preallocated = PETSC_TRUE;
1687:   return 0;
1688: }
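/* Usage sketch (illustrative): approximate an existing operator by sampling
   its action; B, coords, and the numeric values are assumed examples, and zero
   arguments keep the defaults, mirroring the guards above. Per the Note in the
   manual page, this is not available in parallel.

     Mat        A, B;   // B is the operator to approximate (any Mat with a MatMult)
     PetscReal *coords; // spacedim * nlocal coordinates, filled by the caller
     MatCreateH2OpusFromMat(B, 3, coords, PETSC_FALSE, 0.0, 0, 0, 0, 1e-4, &A);
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/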

1690: /*@C
1691:      MatH2OpusGetIndexMap - Access the reordering index set.

1693:    Input Parameter:
1694: .     A - the matrix

1696:    Output Parameter:
1697: .     indexmap - the index set for the reordering

1699:    Level: intermediate

1701: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1702: @*/
1703: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1704: {
1705:   PetscBool   ish2opus;
1706:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1712:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1714:   *indexmap = a->h2opus_indexmap;
1715:   return 0;
1716: }
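/* Usage sketch (illustrative): inspect the reordering induced by the cluster
   tree. Since the code above returns the internal IS without taking a new
   reference, the caller must not destroy it.

     IS is;
     MatH2OpusGetIndexMap(A, &is);
     ISView(is, PETSC_VIEWER_STDOUT_WORLD);
*/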

1718: /*@C
1719:      MatH2OpusMapVec - Maps a vector between the PETSc and H2Opus orderings.

1721:    Input Parameters:
1722: +     A - the matrix
1723: .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1724: -     in - the vector to be mapped

1726:    Output Parameter:
1727: .     out - the newly created mapped vector

1729:    Level: intermediate

1731: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1732: @*/
1733: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1734: {
1735:   PetscBool    ish2opus;
1736:   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1737:   PetscScalar *xin, *xout;
1738:   PetscBool    nm;

1746:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus);
1748:   nm = a->nativemult;
1749:   MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc);
1750:   MatCreateVecs(A, out, NULL);
1751:   MatH2OpusSetNativeMult(A, nm);
1752:   if (!a->sf) { /* same ordering */
1753:     VecCopy(in, *out);
1754:     return 0;
1755:   }
1756:   VecGetArrayRead(in, (const PetscScalar **)&xin);
1757:   VecGetArrayWrite(*out, &xout);
1758:   if (nativetopetsc) {
1759:     PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1760:     PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1761:   } else {
1762:     PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1763:     PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE);
1764:   }
1765:   VecRestoreArrayRead(in, (const PetscScalar **)&xin);
1766:   VecRestoreArrayWrite(*out, &xout);
1767:   return 0;
1768: }
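/* Usage sketch (illustrative): map a vector from the PETSc ordering to the
   H2Opus ordering and back; x is an assumed existing vector compatible with A.
   The output vectors are newly created and owned by the caller.

     Vec x, xnative, xback;
     MatH2OpusMapVec(A, PETSC_FALSE, x, &xnative);    // PETSc -> H2Opus
     MatH2OpusMapVec(A, PETSC_TRUE, xnative, &xback); // H2Opus -> PETSc
     VecDestroy(&xnative);
     VecDestroy(&xback);
*/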

1770: /*@C
1771:      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T.

1773:    Input Parameters:
1774: +     A - the hierarchical `MATH2OPUS` matrix
1775: .     s - the scaling factor
1776: .     U - the dense low-rank update matrix
1777: -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)

1779:    Note:
1780:    The U and V matrices must be in dense format

1782:    Level: intermediate

1784: .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1785: @*/
1786: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1787: {
1788:   PetscBool flg;

1795:   if (V) {
1798:   }

1801:   if (!V) V = U;
1803:   if (!U->cmap->N) return 0;
1804:   PetscLayoutCompare(U->rmap, A->rmap, &flg);
1806:   PetscLayoutCompare(V->rmap, A->cmap, &flg);
1808:   PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg);
1809:   if (flg) {
1810:     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1811:     const PetscScalar *u, *v, *uu, *vv;
1812:     PetscInt           ldu, ldv;
1813:     PetscMPIInt        size;
1814:   #if defined(H2OPUS_USE_MPI)
1815:     h2opusHandle_t handle = a->handle->handle;
1816:   #else
1817:     h2opusHandle_t handle = a->handle;
1818:   #endif
1819:     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1820:     PetscSF   usf, vsf;

1822:     MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1824:     PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0);
1825:     PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, "");
1827:     PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, "");
1829:     MatDenseGetLDA(U, &ldu);
1830:     MatDenseGetLDA(V, &ldv);
1831:     MatBoundToCPU(A, &flg);
1832:     if (usesf) {
1833:       PetscInt n;

1835:       MatDenseGetH2OpusVectorSF(U, a->sf, &usf);
1836:       MatDenseGetH2OpusVectorSF(V, a->sf, &vsf);
1837:       MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N);
1838:       PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL);
1839:       ldu = n;
1840:       ldv = n;
1841:     }
1842:     if (flg) {
1844:       MatDenseGetArrayRead(U, &u);
1845:       MatDenseGetArrayRead(V, &v);
1846:       if (usesf) {
1847:         vv = MatH2OpusGetThrustPointer(*a->yy);
1848:         PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1849:         PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1850:         if (U != V) {
1851:           uu = MatH2OpusGetThrustPointer(*a->xx);
1852:           PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1853:           PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1854:         } else uu = vv;
1855:       } else {
1856:         uu = u;
1857:         vv = v;
1858:       }
1859:       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1860:       MatDenseRestoreArrayRead(U, &u);
1861:       MatDenseRestoreArrayRead(V, &v);
1862:     } else {
1863:   #if defined(PETSC_H2OPUS_USE_GPU)
1864:       PetscBool flgU, flgV;

1867:       PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, "");
1868:       if (flgU) MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U);
1869:       PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, "");
1870:       if (flgV) MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V);
1871:       MatDenseCUDAGetArrayRead(U, &u);
1872:       MatDenseCUDAGetArrayRead(V, &v);
1873:       if (usesf) {
1874:         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1875:         PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1876:         PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE);
1877:         if (U != V) {
1878:           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1879:           PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1880:           PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE);
1881:         } else uu = vv;
1882:       } else {
1883:         uu = u;
1884:         vv = v;
1885:       }
1886:   #else
1887:       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1888:   #endif
1889:       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1890:   #if defined(PETSC_H2OPUS_USE_GPU)
1891:       MatDenseCUDARestoreArrayRead(U, &u);
1892:       MatDenseCUDARestoreArrayRead(V, &v);
1893:       if (flgU) MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U);
1894:       if (flgV) MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V);
1895:   #endif
1896:     }
1897:     PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0);
1898:     a->orthogonal = PETSC_FALSE;
1899:   }
1900:   return 0;
1901: }
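/* Usage sketch (illustrative): apply A <- A + s * U * V^T with dense factors;
   U, V, and the scaling are assumed examples. A subsequent compression restores
   a compact representation, since the update marks the matrix non-orthogonal.

     Mat U, V; // MATDENSE, one column per rank-1 term
     MatH2OpusLowRankUpdate(A, U, V, 1.0); // pass V = NULL to use V = U
     MatH2OpusCompress(A, 1e-6);
*/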
1902: #endif