Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  4: #include <h2opus.h>
  5: #if defined(H2OPUS_USE_MPI)
  6: #include <h2opus/distributed/distributed_h2opus_handle.h>
  7: #include <h2opus/distributed/distributed_geometric_construction.h>
  8: #include <h2opus/distributed/distributed_hgemv.h>
  9: #include <h2opus/distributed/distributed_horthog.h>
 10: #include <h2opus/distributed/distributed_hcompress.h>
 11: #endif
 12: #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
 20: PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
 21: PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
 22: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);

 24: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 26: /* Use GPU only if H2OPUS is configured for GPU */
 27: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 28: #define PETSC_H2OPUS_USE_GPU
 29: #endif
 30: #if defined(PETSC_H2OPUS_USE_GPU)
 31: #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
 32: #else
 33: #define MatH2OpusUpdateIfNeeded(A,B) 0
 34: #endif

 36: // TODO H2OPUS:
 37: // DistributedHMatrix
 38: //   unsymmetric ?
 39: //   transpose for distributed_hgemv?
 40: //   clearData()
 41: // Unify interface for sequential and parallel?
 42: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 43: //
 44: template <class T> class PetscPointCloud : public H2OpusDataSet<T>
 45: {
 46:   private:
 47:     int dimension;
 48:     size_t num_points;
 49:     std::vector<T> pts;

 51:   public:
 52:     PetscPointCloud(int dim, size_t num_pts, const T coords[])
 53:     {
 54:       this->dimension = dim;
 55:       this->num_points = num_pts;

 57:       pts.resize(num_pts*dim);
 58:       if (coords) {
 59:         for (size_t n = 0; n < num_points; n++)
 60:           for (int i = 0; i < dim; i++)
 61:             pts[n*dim + i] = coords[n*dim + i];
 62:       } else {
 63:         PetscReal h = 1./(num_points - 1);
 64:         for (size_t n = 0; n < num_points; n++)
 65:           for (int i = 0; i < dim; i++)
 66:             pts[n*dim + i] = i*h;
 67:       }
 68:     }

 70:     PetscPointCloud(const PetscPointCloud<T>& other)
 71:     {
 72:       size_t N = other.dimension * other.num_points;
 73:       this->dimension = other.dimension;
 74:       this->num_points = other.num_points;
 75:       this->pts.resize(N);
 76:       for (size_t i = 0; i < N; i++)
 77:         this->pts[i] = other.pts[i];
 78:     }

 80:     int getDimension() const
 81:     {
 82:         return dimension;
 83:     }

 85:     size_t getDataSetSize() const
 86:     {
 87:         return num_points;
 88:     }

 90:     T getDataPoint(size_t idx, int dim) const
 91:     {
 92:         assert(dim < dimension && idx < num_points);
 93:         return pts[idx*dimension + dim];
 94:     }

 96:     void Print(std::ostream& out = std::cout)
 97:     {
 98:        out << "Dimension: " << dimension << std::endl;
 99:        out << "NumPoints: " << num_points << std::endl;
100:        for (size_t n = 0; n < num_points; n++) {
101:          for (int d = 0; d < dimension; d++)
102:            out << pts[n*dimension + d] << " ";
103:          out << std::endl;
104:        }
105:     }
106: };

108: template<class T> class PetscFunctionGenerator
109: {
110: private:
111:   MatH2OpusKernel k;
112:   int             dim;
113:   void            *ctx;

115: public:
116:     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
117:     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
118:     T operator()(PetscReal *pt1, PetscReal *pt2)
119:     {
120:         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
121:     }
122: };

124: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

126: /* just to not clutter the code */
127: #if !defined(H2OPUS_USE_GPU)
128: typedef HMatrix HMatrix_GPU;
129: #if defined(H2OPUS_USE_MPI)
130: typedef DistributedHMatrix DistributedHMatrix_GPU;
131: #endif
132: #endif

134: typedef struct {
135: #if defined(H2OPUS_USE_MPI)
136:   distributedH2OpusHandle_t handle;
137: #else
138:   h2opusHandle_t handle;
139: #endif
140:   /* Sequential and parallel matrices are two different classes at the moment */
141:   HMatrix *hmatrix;
142: #if defined(H2OPUS_USE_MPI)
143:   DistributedHMatrix *dist_hmatrix;
144: #else
145:   HMatrix *dist_hmatrix; /* just to not clutter the code */
146: #endif
147:   /* May use permutations */
148:   PetscSF sf;
149:   PetscLayout h2opus_rmap, h2opus_cmap;
150:   IS h2opus_indexmap;
151:   thrust::host_vector<PetscScalar> *xx,*yy;
152:   PetscInt xxs,yys;
153:   PetscBool multsetup;

155:   /* GPU */
156:   HMatrix_GPU *hmatrix_gpu;
157: #if defined(H2OPUS_USE_MPI)
158:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
159: #else
160:   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
161: #endif
162: #if defined(PETSC_H2OPUS_USE_GPU)
163:   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
164:   PetscInt xxs_gpu,yys_gpu;
165: #endif

167:   /* construction from matvecs */
168:   PetscMatrixSampler* sampler;
169:   PetscBool nativemult;

171:   /* Admissibility */
172:   PetscReal eta;
173:   PetscInt  leafsize;

175:   /* for dof reordering */
176:   PetscPointCloud<PetscReal> *ptcloud;

178:   /* kernel for generating matrix entries */
179:   PetscFunctionGenerator<PetscScalar> *kernel;

181:   /* basis orthogonalized? */
182:   PetscBool orthogonal;

184:   /* customization */
185:   PetscInt  basisord;
186:   PetscInt  max_rank;
187:   PetscInt  bs;
188:   PetscReal rtol;
189:   PetscInt  norm_max_samples;
190:   PetscBool check_construction;
191:   PetscBool hara_verbose;

193:   /* keeps track of MatScale values */
194:   PetscScalar s;
195: } Mat_H2OPUS;

197: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
198: {
199:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

203: #if defined(H2OPUS_USE_MPI)
204:   h2opusDestroyDistributedHandle(a->handle);
205: #else
206:   h2opusDestroyHandle(a->handle);
207: #endif
208:   delete a->dist_hmatrix;
209:   delete a->hmatrix;
210:   PetscSFDestroy(&a->sf);
211:   PetscLayoutDestroy(&a->h2opus_rmap);
212:   PetscLayoutDestroy(&a->h2opus_cmap);
213:   ISDestroy(&a->h2opus_indexmap);
214:   delete a->xx;
215:   delete a->yy;
216:   delete a->hmatrix_gpu;
217:   delete a->dist_hmatrix_gpu;
218: #if defined(PETSC_H2OPUS_USE_GPU)
219:   delete a->xx_gpu;
220:   delete a->yy_gpu;
221: #endif
222:   delete a->sampler;
223:   delete a->ptcloud;
224:   delete a->kernel;
225:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL);
226:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL);
227:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL);
228:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL);
229:   PetscObjectChangeTypeName((PetscObject)A,NULL);
230:   PetscFree(A->data);
231:   return(0);
232: }

234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
237:   PetscBool      ish2opus;

243:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
244:   if (ish2opus) {
245:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247:         PetscLayout t;
248:         t = A->rmap;
249:         A->rmap = a->h2opus_rmap;
250:         a->h2opus_rmap = t;
251:         t = A->cmap;
252:         A->cmap = a->h2opus_cmap;
253:         a->h2opus_cmap = t;
254:       }
255:     }
256:     a->nativemult = nm;
257:   }
258:   return(0);
259: }

261: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262: {
263:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
264:   PetscBool      ish2opus;

270:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
271:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
272:   *nm = a->nativemult;
273:   return(0);
274: }
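/* Usage sketch (illustrative, not part of the original source): with native multiply
   enabled, the PetscSF permutation in the mult kernels is skipped and, in parallel,
   the matrix exposes the H2Opus layouts, so work vectors should be created from A itself.

     Vec x,y;
     MatH2OpusSetNativeMult(A,PETSC_TRUE);
     MatCreateVecs(A,&x,&y);                 // vectors in the H2Opus ordering
     MatMult(A,x,y);
     MatH2OpusSetNativeMult(A,PETSC_FALSE);  // restore the PETSc ordering
*/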

276: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
277: {
279:   PetscBool      ish2opus;
280:   PetscInt       nmax = PETSC_DECIDE;
281:   Mat_H2OPUS     *a = NULL;
282:   PetscBool      mult = PETSC_FALSE;

285:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
 286:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
287:     a = (Mat_H2OPUS*)A->data;

289:     nmax = a->norm_max_samples;
290:     mult = a->nativemult;
291:     MatH2OpusSetNativeMult(A,PETSC_TRUE);
292:   } else {
293:     PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL);
294:   }
295:   MatApproximateNorm_Private(A,normtype,nmax,n);
296:   if (a) { MatH2OpusSetNativeMult(A,mult); }
297:   return(0);
298: }

300: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
301: {
302:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
303: #if defined(H2OPUS_USE_MPI)
304:   h2opusHandle_t handle = h2opus->handle->handle;
305: #else
306:   h2opusHandle_t handle = h2opus->handle;
307: #endif
308:   PetscBool      boundtocpu = PETSC_TRUE;
309:   PetscScalar    *xx,*yy,*uxx,*uyy;
310:   PetscInt       blda,clda;
311:   PetscMPIInt    size;
312:   PetscSF        bsf,csf;
313:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

317:   HLibProfile::clear();
318: #if defined(PETSC_H2OPUS_USE_GPU)
319:   boundtocpu = A->boundtocpu;
320: #endif
321:   MatDenseGetLDA(B,&blda);
322:   MatDenseGetLDA(C,&clda);
323:   if (usesf) {
324:     PetscInt n;

326:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
327:     PetscObjectQuery((PetscObject)B,"_math2opus_vectorsf",(PetscObject*)&bsf);
328:     if (!bsf) {
329:       PetscSFGetVectorSF(h2opus->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);
330:       PetscObjectCompose((PetscObject)B,"_math2opus_vectorsf",(PetscObject)bsf);
331:       PetscObjectDereference((PetscObject)bsf);
332:     }
333:     PetscObjectQuery((PetscObject)C,"_math2opus_vectorsf",(PetscObject*)&csf);
334:     if (!csf) {
335:       PetscSFGetVectorSF(h2opus->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);
336:       PetscObjectCompose((PetscObject)C,"_math2opus_vectorsf",(PetscObject)csf);
337:       PetscObjectDereference((PetscObject)csf);
338:     }
339:     blda = n;
340:     clda = n;
341:   }
342:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
343:   if (boundtocpu) {
344:     if (usesf) {
345:       PetscInt n;

347:       PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
348:       if (h2opus->xxs < B->cmap->n) { h2opus->xx->resize(n*B->cmap->N); h2opus->xxs = B->cmap->N; }
349:       if (h2opus->yys < B->cmap->n) { h2opus->yy->resize(n*B->cmap->N); h2opus->yys = B->cmap->N; }
350:     }
351:     MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
352:     MatDenseGetArrayWrite(C,&yy);
353:     if (usesf) {
354:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
355:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
356:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
357:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
358:     } else {
359:       uxx = xx;
360:       uyy = yy;
361:     }
362:     if (size > 1) {
363:       if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
364:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
365: #if defined(H2OPUS_USE_MPI)
366:       distributed_hgemv(/*transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
367: #endif
368:     } else {
369:       if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
370:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
371:     }
372:     MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
373:     if (usesf) {
374:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
375:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
376:     }
377:     MatDenseRestoreArrayWrite(C,&yy);
378: #if defined(PETSC_H2OPUS_USE_GPU)
379:   } else {
380:     PetscBool ciscuda,biscuda;

382:     if (usesf) {
383:       PetscInt n;

385:       PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
386:       if (h2opus->xxs_gpu < B->cmap->n) { h2opus->xx_gpu->resize(n*B->cmap->N); h2opus->xxs_gpu = B->cmap->N; }
387:       if (h2opus->yys_gpu < B->cmap->n) { h2opus->yy_gpu->resize(n*B->cmap->N); h2opus->yys_gpu = B->cmap->N; }
388:     }
389:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
390:     PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
391:     if (!biscuda) {
392:       MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
393:     }
394:     PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
395:     if (!ciscuda) {
396:       C->assembled = PETSC_TRUE;
397:       MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
398:     }
399:     MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
400:     MatDenseCUDAGetArrayWrite(C,&yy);
401:     if (usesf) {
402:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
403:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
404:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
405:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
406:     } else {
407:       uxx = xx;
408:       uyy = yy;
409:     }
410:     PetscLogGpuTimeBegin();
411:     if (size > 1) {
412:       if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
413:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
414: #if defined(H2OPUS_USE_MPI)
415:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
416: #endif
417:     } else {
418:       if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
419:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
420:     }
421:     PetscLogGpuTimeEnd();
422:     MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
423:     if (usesf) {
424:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
425:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
426:     }
427:     MatDenseCUDARestoreArrayWrite(C,&yy);
428:     if (!biscuda) {
429:       MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
430:     }
431:     if (!ciscuda) {
432:       MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
433:     }
434: #endif
435:   }
436:   { /* log flops */
437:     double gops,time,perf,dev;
438:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
439: #if defined(PETSC_H2OPUS_USE_GPU)
440:     if (boundtocpu) {
441:       PetscLogFlops(1e9*gops);
442:     } else {
443:       PetscLogGpuFlops(1e9*gops);
444:     }
445: #else
446:     PetscLogFlops(1e9*gops);
447: #endif
448:   }
449:   return(0);
450: }

452: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
453: {
454:   Mat_Product    *product = C->product;

458:   MatCheckProduct(C,1);
459:   switch (product->type) {
460:   case MATPRODUCT_AB:
461:     MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C);
462:     break;
463:   case MATPRODUCT_AtB:
464:     MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C);
465:     break;
466:   default:
467:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
468:   }
469:   return(0);
470: }

472: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
473: {
475:   Mat_Product    *product = C->product;
476:   PetscBool      cisdense;
477:   Mat            A,B;

480:   MatCheckProduct(C,1);
481:   A = product->A;
482:   B = product->B;
483:   switch (product->type) {
484:   case MATPRODUCT_AB:
485:     MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
486:     MatSetBlockSizesFromMats(C,product->A,product->B);
487:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
488:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
489:     MatSetUp(C);
490:     break;
491:   case MATPRODUCT_AtB:
492:     MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
493:     MatSetBlockSizesFromMats(C,product->A,product->B);
494:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
495:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
496:     MatSetUp(C);
497:     break;
498:   default:
499:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
500:   }
501:   C->ops->productsymbolic = NULL;
502:   C->ops->productnumeric = MatProductNumeric_H2OPUS;
503:   return(0);
504: }

506: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
507: {
509:   MatCheckProduct(C,1);
510:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
511:     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
512:   }
513:   return(0);
514: }
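/* Usage sketch (illustrative, not part of the original source): the MatProduct hooks
   above support C = A*B and C = A^T*B with a dense B through the standard API, e.g.

     Mat C;
     MatProductCreate(A,B,NULL,&C);
     MatProductSetType(C,MATPRODUCT_AB);   // or MATPRODUCT_AtB
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);
     MatProductNumeric(C);
*/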

516: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517: {
518:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
519: #if defined(H2OPUS_USE_MPI)
520:   h2opusHandle_t handle = h2opus->handle->handle;
521: #else
522:   h2opusHandle_t handle = h2opus->handle;
523: #endif
524:   PetscBool      boundtocpu = PETSC_TRUE;
525:   PetscInt       n;
526:   PetscScalar    *xx,*yy,*uxx,*uyy;
527:   PetscMPIInt    size;
528:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

532:   HLibProfile::clear();
533:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
534: #if defined(PETSC_H2OPUS_USE_GPU)
535:   boundtocpu = A->boundtocpu;
536: #endif
537:   if (usesf) {
538:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
539:   } else n = A->rmap->n;
540:   if (boundtocpu) {
541:     VecGetArrayRead(x,(const PetscScalar**)&xx);
542:     if (sy == 0.0) {
543:       VecGetArrayWrite(y,&yy);
544:     } else {
545:       VecGetArray(y,&yy);
546:     }
547:     if (usesf) {
548:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
549:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

551:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
552:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
553:       if (sy != 0.0) {
554:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
555:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
556:       }
557:     } else {
558:       uxx = xx;
559:       uyy = yy;
560:     }
561:     if (size > 1) {
562:       if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
563:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
564: #if defined(H2OPUS_USE_MPI)
565:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
566: #endif
567:     } else {
568:       if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
569:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
570:     }
571:     VecRestoreArrayRead(x,(const PetscScalar**)&xx);
572:     if (usesf) {
573:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
574:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
575:     }
576:     if (sy == 0.0) {
577:       VecRestoreArrayWrite(y,&yy);
578:     } else {
579:       VecRestoreArray(y,&yy);
580:     }
581: #if defined(PETSC_H2OPUS_USE_GPU)
582:   } else {
583:     VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
584:     if (sy == 0.0) {
585:       VecCUDAGetArrayWrite(y,&yy);
586:     } else {
587:       VecCUDAGetArray(y,&yy);
588:     }
589:     if (usesf) {
590:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
591:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

593:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
594:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
595:       if (sy != 0.0) {
596:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
597:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
598:       }
599:     } else {
600:       uxx = xx;
601:       uyy = yy;
602:     }
603:     PetscLogGpuTimeBegin();
604:     if (size > 1) {
605:       if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
606:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
607: #if defined(H2OPUS_USE_MPI)
608:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
609: #endif
610:     } else {
611:       if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
612:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
613:     }
614:     PetscLogGpuTimeEnd();
615:     VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
616:     if (usesf) {
617:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
618:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
619:     }
620:     if (sy == 0.0) {
621:       VecCUDARestoreArrayWrite(y,&yy);
622:     } else {
623:       VecCUDARestoreArray(y,&yy);
624:     }
625: #endif
626:   }
627:   { /* log flops */
628:     double gops,time,perf,dev;
629:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
630: #if defined(PETSC_H2OPUS_USE_GPU)
631:     if (boundtocpu) {
632:       PetscLogFlops(1e9*gops);
633:     } else {
634:       PetscLogGpuFlops(1e9*gops);
635:     }
636: #else
637:     PetscLogFlops(1e9*gops);
638: #endif
639:   }
640:   return(0);
641: }

643: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
644: {
645:   PetscBool      xiscuda,yiscuda;

649:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
650:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
651:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
652:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE);
653:   return(0);
654: }

656: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
657: {
658:   PetscBool      xiscuda,yiscuda;

662:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
663:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
664:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
665:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE);
666:   return(0);
667: }

669: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
670: {
671:   PetscBool      xiscuda,ziscuda;

675:   VecCopy(y,z);
676:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
677:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
678:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
679:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE);
680:   return(0);
681: }

683: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
684: {
685:   PetscBool      xiscuda,ziscuda;

689:   VecCopy(y,z);
690:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
691:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
692:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
693:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE);
694:   return(0);
695: }

697: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
698: {
699:   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

702:   a->s *= s;
703:   return(0);
704: }

706: static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
707: {
708:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

712:   PetscOptionsHead(PetscOptionsObject,"H2OPUS options");
713:   PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL);
714:   PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
715:   PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
716:   PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
717:   PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
 718:   PetscOptionsInt("-mat_h2opus_normsamples","Maximum number of samples to be taken when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL);
719:   PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
720:   PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
721:   PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL);
722:   PetscOptionsTail();
723:   return(0);
724: }
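/* Example command line (illustrative, not part of the original source): the options
   registered above can be combined, e.g.

     -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 \
     -mat_h2opus_maxrank 128 -mat_h2opus_rtol 1e-6 -mat_h2opus_check
*/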

726: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);

728: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
729: {
730:   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
731:   Vec               c;
732:   PetscInt          spacedim;
733:   const PetscScalar *coords;
734:   PetscErrorCode    ierr;

737:   if (a->ptcloud) return(0);
738:   PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c);
739:   if (!c && a->sampler) {
740:     Mat S = a->sampler->GetSamplingMat();

742:     PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c);
743:   }
744:   if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
745:   VecGetArrayRead(c,&coords);
746:   VecGetBlockSize(c,&spacedim);
747:   MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL);
748:   VecRestoreArrayRead(c,&coords);
749:   return(0);
750: }

752: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
753: {
754:   MPI_Comm       comm;
755:   PetscMPIInt    size;
757:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
758:   PetscInt       n = 0,*idx = NULL;
759:   int            *iidx = NULL;
760:   PetscCopyMode  own;
761:   PetscBool      rid;

764:   if (a->multsetup) return(0);
 765:   if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
766:     PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
767: #if defined(PETSC_H2OPUS_USE_GPU)
768:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
769:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
770:     a->xxs_gpu = 1;
771:     a->yys_gpu = 1;
772: #endif
773:     a->xx  = new thrust::host_vector<PetscScalar>(n);
774:     a->yy  = new thrust::host_vector<PetscScalar>(n);
775:     a->xxs = 1;
776:     a->yys = 1;
777:   } else {
778:     IS is;
779:     PetscObjectGetComm((PetscObject)A,&comm);
780:     MPI_Comm_size(comm,&size);
781:     if (!a->h2opus_indexmap) {
782:       if (size > 1) {
783:         if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
784: #if defined(H2OPUS_USE_MPI)
785:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
786:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
787: #endif
788:       } else {
789:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
790:         n    = a->hmatrix->u_basis_tree.index_map.size();
791:       }

793:       if (PetscDefined(USE_64BIT_INDICES)) {
794:         PetscInt i;

796:         own  = PETSC_OWN_POINTER;
797:         PetscMalloc1(n,&idx);
798:         for (i=0;i<n;i++) idx[i] = iidx[i];
799:       } else {
800:         own  = PETSC_COPY_VALUES;
801:         idx  = (PetscInt*)iidx;
802:       }
803:       ISCreateGeneral(comm,n,idx,own,&is);
804:       ISSetPermutation(is);
805:       ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view");
806:       a->h2opus_indexmap = is;
807:     }
808:     ISGetLocalSize(a->h2opus_indexmap,&n);
809:     ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
810:     rid  = (PetscBool)(n == A->rmap->n);
811:     MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm);
812:     if (rid) {
813:       ISIdentity(a->h2opus_indexmap,&rid);
814:     }
815:     if (!rid) {
816:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
817:         PetscLayoutCreate(comm,&a->h2opus_rmap);
818:         PetscLayoutSetLocalSize(a->h2opus_rmap,n);
819:         PetscLayoutSetUp(a->h2opus_rmap);
820:         PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap);
821:       }
822:       PetscSFCreate(comm,&a->sf);
823:       PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
824:       PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view");
825: #if defined(PETSC_H2OPUS_USE_GPU)
826:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
827:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
828:       a->xxs_gpu = 1;
829:       a->yys_gpu = 1;
830: #endif
831:       a->xx  = new thrust::host_vector<PetscScalar>(n);
832:       a->yy  = new thrust::host_vector<PetscScalar>(n);
833:       a->xxs = 1;
834:       a->yys = 1;
835:     }
836:     ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
837:   }
838:   a->multsetup = PETSC_TRUE;
839:   return(0);
840: }

842: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
843: {
844:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
845: #if defined(H2OPUS_USE_MPI)
846:   h2opusHandle_t handle = a->handle->handle;
847: #else
848:   h2opusHandle_t handle = a->handle;
849: #endif
850:   PetscBool      kernel = PETSC_FALSE;
851:   PetscBool      boundtocpu = PETSC_TRUE;
852:   PetscBool      samplingdone = PETSC_FALSE;
853:   MPI_Comm       comm;
854:   PetscMPIInt    size;

858:   PetscObjectGetComm((PetscObject)A,&comm);
859:   if (A->rmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
860:   if (A->rmap->N != A->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
861:   MPI_Comm_size(comm,&size);
862:   /* TODO REUSABILITY of geometric construction */
863:   delete a->hmatrix;
864:   delete a->dist_hmatrix;
865: #if defined(PETSC_H2OPUS_USE_GPU)
866:   delete a->hmatrix_gpu;
867:   delete a->dist_hmatrix_gpu;
868: #endif
869:   a->orthogonal = PETSC_FALSE;

871:   /* TODO: other? */
872:   H2OpusBoxCenterAdmissibility adm(a->eta);

874:   PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0);
875:   if (size > 1) {
876: #if defined(H2OPUS_USE_MPI)
877:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
878: #else
879:     a->dist_hmatrix = NULL;
880: #endif
881:   } else {
882:     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
883:   }
884:   MatH2OpusInferCoordinates_Private(A);
885:   if (!a->ptcloud) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
886:   if (a->kernel) {
887:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
888:     if (size > 1) {
889:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
890: #if defined(H2OPUS_USE_MPI)
891:       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
892: #endif
893:     } else {
894:       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
895:     }
896:     kernel = PETSC_TRUE;
897:   } else {
898:     if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
899:     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
900:   }
901:   MatSetUpMultiply_H2OPUS(A);

903: #if defined(PETSC_H2OPUS_USE_GPU)
904:   boundtocpu = A->boundtocpu;
905:   if (!boundtocpu) {
906:     if (size > 1) {
907:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
908: #if defined(H2OPUS_USE_MPI)
909:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
910: #endif
911:     } else {
912:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
913:     }
914:   }
915: #endif
916:   if (size == 1) {
917:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
918:       PetscReal Anorm;
919:       bool      verbose;

921:       PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL);
922:       verbose = a->hara_verbose;
923:       MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm);
924:       if (a->hara_verbose) { PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs); }
925:       if (a->sf && !a->nativemult) {
926:         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
927:       }
928:       a->sampler->SetStream(handle->getMainStream());
929:       if (boundtocpu) {
930:         a->sampler->SetGPUSampling(false);
931:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
932: #if defined(PETSC_H2OPUS_USE_GPU)
933:       } else {
934:         a->sampler->SetGPUSampling(true);
935:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
936: #endif
937:       }
938:       samplingdone = PETSC_TRUE;
939:     }
940:   }
941: #if defined(PETSC_H2OPUS_USE_GPU)
942:   if (!boundtocpu) {
943:     delete a->hmatrix;
944:     delete a->dist_hmatrix;
945:     a->hmatrix = NULL;
946:     a->dist_hmatrix = NULL;
947:   }
948:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
949: #endif
950:   PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0);

952:   if (!a->s) a->s = 1.0;
953:   A->assembled = PETSC_TRUE;

955:   if (samplingdone) {
956:     PetscBool check = a->check_construction;
957:     PetscBool checke = PETSC_FALSE;

959:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL);
960:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL);
961:     if (check) {
962:       Mat       E,Ae;
963:       PetscReal n1,ni,n2;
964:       PetscReal n1A,niA,n2A;
965:       void      (*normfunc)(void);

967:       Ae   = a->sampler->GetSamplingMat();
968:       MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
969:       MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
970:       MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
971:       MatNorm(E,NORM_1,&n1);
972:       MatNorm(E,NORM_INFINITY,&ni);
973:       MatNorm(E,NORM_2,&n2);
974:       if (checke) {
975:         Mat eA,eE,eAe;

977:         MatComputeOperator(A,MATAIJ,&eA);
978:         MatComputeOperator(E,MATAIJ,&eE);
979:         MatComputeOperator(Ae,MATAIJ,&eAe);
980:         MatChop(eA,PETSC_SMALL);
981:         MatChop(eE,PETSC_SMALL);
982:         MatChop(eAe,PETSC_SMALL);
983:         PetscObjectSetName((PetscObject)eA,"H2Mat");
984:         MatView(eA,NULL);
985:         PetscObjectSetName((PetscObject)eAe,"S");
986:         MatView(eAe,NULL);
987:         PetscObjectSetName((PetscObject)eE,"H2Mat - S");
988:         MatView(eE,NULL);
989:         MatDestroy(&eA);
990:         MatDestroy(&eE);
991:         MatDestroy(&eAe);
992:       }

994:       MatGetOperation(Ae,MATOP_NORM,&normfunc);
995:       MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
996:       MatNorm(Ae,NORM_1,&n1A);
997:       MatNorm(Ae,NORM_INFINITY,&niA);
998:       MatNorm(Ae,NORM_2,&n2A);
999:       n1A  = PetscMax(n1A,PETSC_SMALL);
1000:       n2A  = PetscMax(n2A,PETSC_SMALL);
1001:       niA  = PetscMax(niA,PETSC_SMALL);
1002:       MatSetOperation(Ae,MATOP_NORM,normfunc);
1003:       PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
1004:       MatDestroy(&E);
1005:     }
1006:   }
1007:   return(0);
1008: }

1010: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1011: {
1013:   PetscMPIInt    size;
1014:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1017:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1018:   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1019:   else {
1020:     a->hmatrix->clearData();
1021: #if defined(PETSC_H2OPUS_USE_GPU)
1022:     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1023: #endif
1024:   }
1025:   return(0);
1026: }

1028: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1029: {
1030:   Mat            A;
1031:   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1032: #if defined(PETSC_H2OPUS_USE_GPU)
1033:   PetscBool      iscpu = PETSC_FALSE;
1034: #else
1035:   PetscBool      iscpu = PETSC_TRUE;
1036: #endif
1038:   MPI_Comm       comm;

1041:   PetscObjectGetComm((PetscObject)B,&comm);
1042:   MatCreate(comm,&A);
1043:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1044:   MatSetType(A,MATH2OPUS);
1045:   MatPropagateSymmetryOptions(B,A);
1046:   a = (Mat_H2OPUS*)A->data;

1048:   a->eta              = b->eta;
1049:   a->leafsize         = b->leafsize;
1050:   a->basisord         = b->basisord;
1051:   a->max_rank         = b->max_rank;
1052:   a->bs               = b->bs;
1053:   a->rtol             = b->rtol;
1054:   a->norm_max_samples = b->norm_max_samples;
1055:   if (op == MAT_COPY_VALUES) a->s = b->s;

1057:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1058:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1060: #if defined(H2OPUS_USE_MPI)
1061:   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1062: #if defined(PETSC_H2OPUS_USE_GPU)
1063:   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1064: #endif
1065: #endif
1066:   if (b->hmatrix) {
1067:     a->hmatrix = new HMatrix(*b->hmatrix);
1068:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1069:   }
1070: #if defined(PETSC_H2OPUS_USE_GPU)
1071:   if (b->hmatrix_gpu) {
1072:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1073:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1074:   }
1075: #endif
1076:   if (b->sf) {
1077:     PetscObjectReference((PetscObject)b->sf);
1078:     a->sf = b->sf;
1079:   }
1080:   if (b->h2opus_indexmap) {
1081:     PetscObjectReference((PetscObject)b->h2opus_indexmap);
1082:     a->h2opus_indexmap = b->h2opus_indexmap;
1083:   }

1085:   MatSetUp(A);
1086:   MatSetUpMultiply_H2OPUS(A);
1087:   if (op == MAT_COPY_VALUES) {
1088:     A->assembled = PETSC_TRUE;
1089:     a->orthogonal = b->orthogonal;
1090: #if defined(PETSC_H2OPUS_USE_GPU)
1091:     A->offloadmask = B->offloadmask;
1092: #endif
1093:   }
1094: #if defined(PETSC_H2OPUS_USE_GPU)
1095:   iscpu = B->boundtocpu;
1096: #endif
1097:   MatBindToCPU(A,iscpu);

1099:   *nA = A;
1100:   return(0);
1101: }

1103: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1104: {
1105:   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1106:   PetscBool         isascii;
1107:   PetscErrorCode    ierr;
1108:   PetscMPIInt       size;
1109:   PetscViewerFormat format;

1112:   PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1113:   PetscViewerGetFormat(view,&format);
1114:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1115:   if (isascii) {
1116:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1117:       if (size == 1) {
1118:         FILE *fp;
1119:         PetscViewerASCIIGetPointer(view,&fp);
1120:         dumpHMatrix(*h2opus->hmatrix,6,fp);
1121:       }
1122:     } else {
1123:       PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat");
1124:       PetscViewerASCIIPrintf(view,"  PointCloud dim %D\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1125:       PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %D, eta %g\n",h2opus->leafsize,(double)h2opus->eta);
1126:       if (!h2opus->kernel) {
1127:         PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %D, samples %D, tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol);
1128:       } else {
1129:         PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %D\n",h2opus->basisord);
1130:       }
1131:       PetscViewerASCIIPrintf(view,"  Number of samples for norms %D\n",h2opus->norm_max_samples);
1132:       if (size == 1) {
1133:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1134:         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1135: #if defined(PETSC_HAVE_CUDA)
1136:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1137:         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1138: #endif
1139:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1140: #if defined(PETSC_HAVE_CUDA)
1141:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1142: #endif
1143:       } else {
1144: #if defined(PETSC_HAVE_CUDA)
1145:         double      matrix_mem[4] = {0.,0.,0.,0.};
1146:         PetscMPIInt rsize = 4;
1147: #else
1148:         double      matrix_mem[2] = {0.,0.};
1149:         PetscMPIInt rsize = 2;
1150: #endif
1151: #if defined(H2OPUS_USE_MPI)
1152:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1153:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1154: #if defined(PETSC_HAVE_CUDA)
1155:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1156:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1157: #endif
1158: #endif
1159:         MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A));
1160:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1161: #if defined(PETSC_HAVE_CUDA)
1162:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1163: #endif
1164:       }
1165:     }
1166:   }
1167: #if 0
1168:   if (size == 1) {
1169:     char filename[256];
1170:     const char *name;

1172:     PetscObjectGetName((PetscObject)A,&name);
1173:     PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name);
1174:     outputEps(*h2opus->hmatrix,filename);
1175:   }
1176: #endif
1177:   return(0);
1178: }

1180: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1181: {
1182:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1183:   PetscReal      *gcoords;
1184:   PetscInt       N;
1185:   MPI_Comm       comm;
1186:   PetscMPIInt    size;
1187:   PetscBool      cong;

1191:   PetscLayoutSetUp(A->rmap);
1192:   PetscLayoutSetUp(A->cmap);
1193:   PetscObjectGetComm((PetscObject)A,&comm);
1194:   MatHasCongruentLayouts(A,&cong);
1195:   if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1196:   N    = A->rmap->N;
1197:   MPI_Comm_size(comm,&size);
1198:   if (size > 1 && cdist) {
1199:     PetscSF      sf;
1200:     MPI_Datatype dtype;

1202:     MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1203:     MPI_Type_commit(&dtype);

1205:     PetscSFCreate(comm,&sf);
1206:     PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1207:     PetscMalloc1(spacedim*N,&gcoords);
1208:     PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1209:     PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1210:     PetscSFDestroy(&sf);
1211:     MPI_Type_free(&dtype);
1212:   } else gcoords = (PetscReal*)coords;

1214:   delete h2opus->ptcloud;
1215:   delete h2opus->kernel;
1216:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1217:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1218:   if (gcoords != coords) { PetscFree(gcoords); }
1219:   A->preallocated = PETSC_TRUE;
1220:   return(0);
1221: }

1223: #if defined(PETSC_H2OPUS_USE_GPU)
1224: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1225: {
1226:   PetscMPIInt    size;
1227:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1231:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1232:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1233:     if (size > 1) {
1234:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1235: #if defined(H2OPUS_USE_MPI)
1236:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1237:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1238: #endif
1239:     } else {
1240:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1241:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1242:       else *a->hmatrix = *a->hmatrix_gpu;
1243:     }
1244:     delete a->hmatrix_gpu;
1245:     delete a->dist_hmatrix_gpu;
1246:     a->hmatrix_gpu = NULL;
1247:     a->dist_hmatrix_gpu = NULL;
1248:     A->offloadmask = PETSC_OFFLOAD_CPU;
1249:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1250:     if (size > 1) {
1251:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1252: #if defined(H2OPUS_USE_MPI)
1253:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1254:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1255: #endif
1256:     } else {
1257:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1258:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1259:       else *a->hmatrix_gpu = *a->hmatrix;
1260:     }
1261:     delete a->hmatrix;
1262:     delete a->dist_hmatrix;
1263:     a->hmatrix = NULL;
1264:     a->dist_hmatrix = NULL;
1265:     A->offloadmask = PETSC_OFFLOAD_GPU;
1266:   }
1267:   PetscFree(A->defaultvectype);
1268:   if (!flg) {
1269:     PetscStrallocpy(VECCUDA,&A->defaultvectype);
1270:   } else {
1271:     PetscStrallocpy(VECSTANDARD,&A->defaultvectype);
1272:   }
1273:   A->boundtocpu = flg;
1274:   return(0);
1275: }
1276: #endif

1278: /*MC
1279:      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.

1281:    Options Database Keys:
1282: .     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1284:    Notes:
1285:      H2Opus implements hierarchical matrices in the H^2 flavour.
1286:      It supports CPU or NVIDIA GPUs.
1287:      For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust to use H2Opus.
1288:      To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
1289:      For details and additional references, see
1290:        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1291:      available at https://arxiv.org/abs/2109.05451.

1293:    Level: beginner

1295: .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1296: M*/
1297: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1298: {
1299:   Mat_H2OPUS     *a;
1301:   PetscMPIInt    size;

1304: #if defined(PETSC_H2OPUS_USE_GPU)
1305:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1306: #endif
1307:   PetscNewLog(A,&a);
1308:   A->data = (void*)a;

1310:   a->eta              = 0.9;
1311:   a->leafsize         = 32;
1312:   a->basisord         = 4;
1313:   a->max_rank         = 64;
1314:   a->bs               = 32;
1315:   a->rtol             = 1.e-4;
1316:   a->s                = 1.0;
1317:   a->norm_max_samples = 10;
1318: #if defined(H2OPUS_USE_MPI)
1319:   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1320: #else
1321:   h2opusCreateHandle(&a->handle);
1322: #endif
1323:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1324:   PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS);
1325:   PetscMemzero(A->ops,sizeof(struct _MatOps));

1327:   A->ops->destroy          = MatDestroy_H2OPUS;
1328:   A->ops->view             = MatView_H2OPUS;
1329:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1330:   A->ops->mult             = MatMult_H2OPUS;
1331:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1332:   A->ops->multadd          = MatMultAdd_H2OPUS;
1333:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1334:   A->ops->scale            = MatScale_H2OPUS;
1335:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1336:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1337:   A->ops->norm             = MatNorm_H2OPUS;
1338:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1339: #if defined(PETSC_H2OPUS_USE_GPU)
1340:   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1341: #endif

1343:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS);
1344:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS);
1345:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS);
1346:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS);
1347: #if defined(PETSC_H2OPUS_USE_GPU)
1348:   PetscFree(A->defaultvectype);
1349:   PetscStrallocpy(VECCUDA,&A->defaultvectype);
1350: #endif
1351:   return(0);
1352: }
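/* Minimal creation sketch (illustrative, not part of the original source): build a
   MATH2OPUS matrix from a user kernel and point coordinates. The kernel follows the
   MatH2OpusKernel signature used above; the argument list matches the
   MatCreateH2OpusFromKernel() manual page further down. All names (GaussKernel, n,
   spacedim, coords) are hypothetical placeholders.

     static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d]-y[d])*(x[d]-y[d]);
       return PetscExpReal(-r2);
     }

     Mat       A;
     PetscReal *coords; // n local points, spacedim entries each
     MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,n,n,PETSC_DECIDE,PETSC_DECIDE,
                               spacedim,coords,PETSC_TRUE,GaussKernel,NULL,
                               0.9,32,4,&A);  // eta, leafsize, basisord (defaults set above)
     MatSetFromOptions(A);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);   // MatAssemblyEnd_H2OPUS() performs the construction
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/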

1354: /*@C
1355:      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1357:    Input Parameter:
1358: .     A - the matrix

1360:    Level: intermediate

1362: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1363: @*/
1364: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1365: {
1367:   PetscBool      ish2opus;
1368:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1369:   PetscMPIInt    size;
1370:   PetscBool      boundtocpu = PETSC_TRUE;

1375:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1376:   if (!ish2opus) return(0);
1377:   if (a->orthogonal) return(0);
1378:   HLibProfile::clear();
1379:   PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0);
1380: #if defined(PETSC_H2OPUS_USE_GPU)
1381:   boundtocpu = A->boundtocpu;
1382: #endif
1383:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1384:   if (size > 1) {
1385:     if (boundtocpu) {
1386:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1387: #if defined(H2OPUS_USE_MPI)
1388:       distributed_horthog(*a->dist_hmatrix, a->handle);
1389: #endif
1390: #if defined(PETSC_H2OPUS_USE_GPU)
1391:       A->offloadmask = PETSC_OFFLOAD_CPU;
1392:     } else {
1393:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1394:       PetscLogGpuTimeBegin();
1395: #if defined(H2OPUS_USE_MPI)
1396:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1397: #endif
1398:       PetscLogGpuTimeEnd();
1399: #endif
1400:     }
1401:   } else {
1402: #if defined(H2OPUS_USE_MPI)
1403:     h2opusHandle_t handle = a->handle->handle;
1404: #else
1405:     h2opusHandle_t handle = a->handle;
1406: #endif
1407:     if (boundtocpu) {
1408:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1409:       horthog(*a->hmatrix, handle);
1410: #if defined(PETSC_H2OPUS_USE_GPU)
1411:       A->offloadmask = PETSC_OFFLOAD_CPU;
1412:     } else {
1413:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1414:       PetscLogGpuTimeBegin();
1415:       horthog(*a->hmatrix_gpu, handle);
1416:       PetscLogGpuTimeEnd();
1417: #endif
1418:     }
1419:   }
1420:   a->orthogonal = PETSC_TRUE;
1421:   { /* log flops */
1422:     double gops,time,perf,dev;
1423:     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1424: #if defined(PETSC_H2OPUS_USE_GPU)
1425:     if (boundtocpu) {
1426:       PetscLogFlops(1e9*gops);
1427:     } else {
1428:       PetscLogGpuFlops(1e9*gops);
1429:     }
1430: #else
1431:     PetscLogFlops(1e9*gops);
1432: #endif
1433:   }
1434:   PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0);
1435:   return(0);
1436: }
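/* Usage note (illustrative, not part of the original source): an explicit call is rarely
   needed, since MatH2OpusCompress() below orthogonalizes first; e.g.

     MatH2OpusOrthogonalize(A);   // no-op if the basis is already orthogonal
*/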

1438: /*@C
1439:      MatH2OpusCompress - Compress a hierarchical matrix.

1441:    Input Parameters:
1442: +     A - the matrix
1443: -     tol - the absolute truncation threshold

1445:    Level: intermediate

1447: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1448: @*/
1449: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1450: {
1452:   PetscBool      ish2opus;
1453:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1454:   PetscMPIInt    size;
1455:   PetscBool      boundtocpu = PETSC_TRUE;

1460:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1461:   if (!ish2opus) return(0);
1462:   MatH2OpusOrthogonalize(A);
1463:   HLibProfile::clear();
1464:   PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0);
1465: #if defined(PETSC_H2OPUS_USE_GPU)
1466:   boundtocpu = A->boundtocpu;
1467: #endif
1468:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1469:   if (size > 1) {
1470:     if (boundtocpu) {
1471:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1472: #if defined(H2OPUS_USE_MPI)
1473:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1474: #endif
1475: #if defined(PETSC_H2OPUS_USE_GPU)
1476:       A->offloadmask = PETSC_OFFLOAD_CPU;
1477:     } else {
1478:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1479:       PetscLogGpuTimeBegin();
1480: #if defined(H2OPUS_USE_MPI)
1481:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1482: #endif
1483:       PetscLogGpuTimeEnd();
1484: #endif
1485:     }
1486:   } else {
1487: #if defined(H2OPUS_USE_MPI)
1488:     h2opusHandle_t handle = a->handle->handle;
1489: #else
1490:     h2opusHandle_t handle = a->handle;
1491: #endif
1492:     if (boundtocpu) {
1493:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1494:       hcompress(*a->hmatrix, tol, handle);
1495: #if defined(PETSC_H2OPUS_USE_GPU)
1496:       A->offloadmask = PETSC_OFFLOAD_CPU;
1497:     } else {
1498:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1499:       PetscLogGpuTimeBegin();
1500:       hcompress(*a->hmatrix_gpu, tol, handle);
1501:       PetscLogGpuTimeEnd();
1502: #endif
1503:     }
1504:   }
1505:   { /* log flops */
1506:     double gops,time,perf,dev;
1507:     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1508: #if defined(PETSC_H2OPUS_USE_GPU)
1509:     if (boundtocpu) {
1510:       PetscLogFlops(1e9*gops);
1511:     } else {
1512:       PetscLogGpuFlops(1e9*gops);
1513:     }
1514: #else
1515:     PetscLogFlops(1e9*gops);
1516: #endif
1517:   }
1518:   PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0);
1519:   return(0);
1520: }
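/*
   Illustrative usage sketch: compress an already assembled MATH2OPUS matrix A to an
   absolute truncation threshold. The helper name and the tolerance value are
   hypothetical; the bare-call style follows the convention of this listing.
*/
static PetscErrorCode ExampleCompressUsage(Mat A)
{
  PetscReal tol = 1.e-6; /* absolute truncation threshold, chosen for illustration */

  /* MatH2OpusCompress orthogonalizes the basis internally (see MatH2OpusOrthogonalize above),
     so no separate orthogonalization call is needed before compressing */
  MatH2OpusCompress(A,tol);
  return(0);
}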

1522: /*@C
1523:      MatH2OpusSetSamplingMat - Set the matrix to be sampled via matrix-vector products when constructing the hierarchical matrix.

1525:    Input Parameters:
1526: +     A - the hierarchical matrix
1527: .     B - the matrix to be sampled
1528: .     bs - maximum number of samples to be taken concurrently
1529: -     tol - relative tolerance for construction

1531:    Notes: MatAssemblyBegin()/MatAssemblyEnd() must be called afterwards to update the hierarchical matrix.

1533:    Level: intermediate

1535: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1536: @*/
1537: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1538: {
1539:   PetscBool      ish2opus;

1546:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1547:   if (ish2opus) {
1548:     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

1550:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1551:     a->sampler->SetSamplingMat(B);
1552:     if (bs > 0) a->bs = bs;
1553:     if (tol > 0.) a->rtol = tol;
1554:     delete a->kernel;
1555:   }
1556:   return(0);
1557: }
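/*
   Illustrative usage sketch: replace the sampling operator of an existing MATH2OPUS
   matrix A with Bnew and rebuild the hierarchical representation. The helper name
   and the numeric values for bs and tol are hypothetical; as noted in the manual
   page above, assembly must be called again for the new samples to take effect.
*/
static PetscErrorCode ExampleResampleUsage(Mat A, Mat Bnew)
{
  MatH2OpusSetSamplingMat(A,Bnew,32,1.e-4); /* 32 concurrent samples, relative tolerance 1e-4 */
  MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);   /* reconstruction from the new samples happens here */
  MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
  return(0);
}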

1559: /*@C
1560:      MatCreateH2OpusFromKernel - Creates a MATH2OPUS matrix from a user-supplied kernel.

1562:    Input Parameters:
1563: +     comm - MPI communicator
1564: .     m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
1565: .     n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
1566: .     M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
1567: .     N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
1568: .     spacedim - dimension of the space coordinates
1569: .     coords - coordinates of the points
1570: .     cdist - whether or not coordinates are distributed
1571: .     kernel - computational kernel (or NULL)
1572: .     kernelctx - kernel context
1573: .     eta - admissibility condition tolerance
1574: .     leafsize - leaf size in cluster tree
1575: -     basisord - approximation order for Chebychev interpolation of low-rank blocks

1577:    Output Parameter:
1578: .     nA - matrix

1580:    Options Database Keys:
1581: +     -mat_h2opus_leafsize <PetscInt>
1582: .     -mat_h2opus_eta <PetscReal>
1583: .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1584: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

1586:    Level: intermediate

1588: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1589: @*/
1590: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1591: {
1592:   Mat            A;
1593:   Mat_H2OPUS     *h2opus;
1594: #if defined(PETSC_H2OPUS_USE_GPU)
1595:   PetscBool      iscpu = PETSC_FALSE;
1596: #else
1597:   PetscBool      iscpu = PETSC_TRUE;
1598: #endif

1602:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1603:   MatCreate(comm,&A);
1604:   MatSetSizes(A,m,n,M,N);
1605:   if (M != N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1606:   MatSetType(A,MATH2OPUS);
1607:   MatBindToCPU(A,iscpu);
1608:   MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx);

1610:   h2opus = (Mat_H2OPUS*)A->data;
1611:   if (eta > 0.) h2opus->eta = eta;
1612:   if (leafsize > 0) h2opus->leafsize = leafsize;
1613:   if (basisord > 0) h2opus->basisord = basisord;

1615:   *nA = A;
1616:   return(0);
1617: }
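/*
   Illustrative usage sketch: assemble a MATH2OPUS matrix from a Gaussian kernel
   evaluated at user-provided point coordinates. The helper names are hypothetical,
   and the MatH2OpusKernel callback is assumed to take (sdim, x, y, ctx) and return
   the matrix entry. Negative values for eta, leafsize and basisord keep the
   defaults, which can then be adjusted via the -mat_h2opus_* options.
*/
static PetscScalar ExampleGaussianKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
{
  PetscReal d2 = 0.0;
  PetscInt  d;

  for (d = 0; d < sdim; d++) d2 += (x[d]-y[d])*(x[d]-y[d]);
  return PetscExpReal(-d2);
}

static PetscErrorCode ExampleCreateFromKernelUsage(MPI_Comm comm, PetscInt nlocal, PetscInt spacedim, const PetscReal coords[], Mat *A)
{
  /* coords holds only the locally owned points, hence cdist = PETSC_TRUE */
  MatCreateH2OpusFromKernel(comm,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE,spacedim,coords,PETSC_TRUE,ExampleGaussianKernel,NULL,-1.0,-1,-1,A);
  MatSetFromOptions(*A);
  MatAssemblyBegin(*A,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(*A,MAT_FINAL_ASSEMBLY);
  return(0);
}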

1619: /*@C
1620:      MatCreateH2OpusFromMat - Creates a MATH2OPUS matrix by sampling a user-supplied operator.

1622:    Input Parameters:
1623: +     B - the matrix to be sampled
1624: .     spacedim - dimension of the space coordinates
1625: .     coords - coordinates of the points
1626: .     cdist - whether or not coordinates are distributed
1627: .     eta - admissibility condition tolerance
1628: .     leafsize - leaf size in cluster tree
1629: .     maxrank - maximum rank allowed
1630: .     bs - maximum number of samples to be taken concurrently
1631: -     rtol - relative tolerance for construction

1633:    Output Parameter:
1634: .     nA - matrix

1636:    Options Database Keys:
1637: +     -mat_h2opus_leafsize <PetscInt>
1638: .     -mat_h2opus_eta <PetscReal>
1639: .     -mat_h2opus_maxrank <PetscInt>
1640: .     -mat_h2opus_samples <PetscInt>
1641: .     -mat_h2opus_rtol <PetscReal>
1642: .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1643: .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1644: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

1646:    Notes: not available in parallel

1648:    Level: intermediate

1650: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1651: @*/
1652: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1653: {
1654:   Mat            A;
1655:   Mat_H2OPUS     *h2opus;
1656:   MPI_Comm       comm;
1657:   PetscBool      boundtocpu = PETSC_TRUE;

1669:   PetscObjectGetComm((PetscObject)B,&comm);
1670:   if (B->rmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1671:   if (B->rmap->N != B->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1672:   MatCreate(comm,&A);
1673:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1674: #if defined(PETSC_H2OPUS_USE_GPU)
1675:   {
1676:     PetscBool iscuda;
1677:     VecType   vtype;

1679:     MatGetVecType(B,&vtype);
1680:     PetscStrcmp(vtype,VECCUDA,&iscuda);
1681:     if (!iscuda) {
1682:       PetscStrcmp(vtype,VECSEQCUDA,&iscuda);
1683:       if (!iscuda) {
1684:         PetscStrcmp(vtype,VECMPICUDA,&iscuda);
1685:       }
1686:     }
1687:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1688:   }
1689: #endif
1690:   MatSetType(A,MATH2OPUS);
1691:   MatBindToCPU(A,boundtocpu);
1692:   if (spacedim) {
1693:     MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL);
1694:   }
1695:   MatPropagateSymmetryOptions(B,A);
1696:   /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1698:   h2opus = (Mat_H2OPUS*)A->data;
1699:   h2opus->sampler = new PetscMatrixSampler(B);
1700:   if (eta > 0.) h2opus->eta = eta;
1701:   if (leafsize > 0) h2opus->leafsize = leafsize;
1702:   if (maxrank > 0) h2opus->max_rank = maxrank;
1703:   if (bs > 0) h2opus->bs = bs;
1704:   if (rtol > 0.) h2opus->rtol = rtol;
1705:   *nA = A;
1706:   A->preallocated = PETSC_TRUE;
1707:   return(0);
1708: }
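/*
   Illustrative usage sketch: build a hierarchical approximation of an existing
   (sequential) matrix B by sampling its action. The helper name is hypothetical;
   negative values keep the defaults for eta, leafsize, maxrank, bs and rtol, and
   coords describes the geometry attached to the rows/columns of B.
*/
static PetscErrorCode ExampleCreateFromMatUsage(Mat B, PetscInt spacedim, const PetscReal coords[], Mat *A)
{
  MatCreateH2OpusFromMat(B,spacedim,coords,PETSC_FALSE,-1.0,-1,-1,-1,-1.0,A);
  MatSetFromOptions(*A);
  MatAssemblyBegin(*A,MAT_FINAL_ASSEMBLY); /* sampling of B takes place during assembly */
  MatAssemblyEnd(*A,MAT_FINAL_ASSEMBLY);
  return(0);
}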

1710: /*@C
1711:      MatH2OpusGetIndexMap - Access reordering index set.

1713:    Input Parameter:
1714: .     A - the matrix

1716:    Output Parameter:
1717: .     indexmap - the index set for the reordering

1719:    Level: intermediate

1721: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1722: @*/
1723: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1724: {
1725:   PetscBool      ish2opus;
1726:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1733:   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1734:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1735:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1736:   *indexmap = a->h2opus_indexmap;
1737:   return(0);
1738: }
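/*
   Illustrative usage sketch: inspect the index set describing how H2OPUS reorders
   the PETSc rows/columns. As the code above shows, the returned IS references
   internal data and must not be destroyed by the caller. The helper name is
   hypothetical.
*/
static PetscErrorCode ExampleViewIndexMapUsage(Mat A)
{
  IS indexmap;

  MatH2OpusGetIndexMap(A,&indexmap);
  ISView(indexmap,PETSC_VIEWER_STDOUT_WORLD);
  return(0);
}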

1740: /*@C
1741:      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

1743:    Input Parameters:
1744: +     A - the matrix
1745: .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1746: -     in - the vector to be mapped

1748:    Output Parameter:
1749: .     out - the newly created mapped vector

1751:    Level: intermediate

1753: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1754: @*/
1755: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1756: {
1757:   PetscBool      ish2opus;
1758:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1759:   PetscScalar    *xin,*xout;
1760:   PetscBool      nm;

1769:   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1770:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1771:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1772:   nm   = a->nativemult;
1773:   MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc);
1774:   MatCreateVecs(A,out,NULL);
1775:   MatH2OpusSetNativeMult(A,nm);
1776:   if (!a->sf) { /* same ordering */
1777:     VecCopy(in,*out);
1778:     return(0);
1779:   }
1780:   VecGetArrayRead(in,(const PetscScalar**)&xin);
1781:   VecGetArrayWrite(*out,&xout);
1782:   if (nativetopetsc) {
1783:     PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1784:     PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1785:   } else {
1786:     PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1787:     PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1788:   }
1789:   VecRestoreArrayRead(in,(const PetscScalar**)&xin);
1790:   VecRestoreArrayWrite(*out,&xout);
1791:   return(0);
1792: }
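/*
   Illustrative usage sketch: permute a vector given in the PETSc ordering into the
   native H2Opus ordering and clean up afterwards. The helper name is hypothetical;
   as documented above, the output vector is created by the call and is therefore
   owned (and destroyed) by the caller.
*/
static PetscErrorCode ExampleMapVecUsage(Mat A, Vec xpetsc)
{
  Vec xnative;

  MatH2OpusMapVec(A,PETSC_FALSE,xpetsc,&xnative); /* PETSc ordering -> H2Opus ordering */
  /* ... use xnative ... */
  VecDestroy(&xnative);
  return(0);
}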
1793: #endif