Actual source code: baijmkl.c

  1: /*
  2:   Defines basic operations for the MATSEQBAIJMKL matrix class.
  3:   Uses sparse BLAS operations from the Intel Math Kernel Library (MKL)
  4:   wherever possible. If the MKL version in use is older than 11.3, the default
  5:   PETSc code for sparse matrix operations is used.
  6: */
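     /*
       Usage sketch (illustrative, not part of this file): this matrix class is normally selected
       programmatically with MatSetType() or, when MatSetFromOptions() is called, at runtime via
       -mat_type seqbaijmkl. A minimal sequential example, with sizes chosen arbitrarily:

         Mat A;
         PetscCall(MatCreate(PETSC_COMM_SELF, &A));
         PetscCall(MatSetSizes(A, 12, 12, 12, 12));
         PetscCall(MatSetType(A, MATSEQBAIJMKL));
         PetscCall(MatSeqBAIJSetPreallocation(A, 3, 2, NULL)); // block size 3, about 2 nonzero blocks per block row
         // ... MatSetValuesBlocked(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult(), ...
         PetscCall(MatDestroy(&A));
     */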

  8: #include <../src/mat/impls/baij/seq/baij.h>
  9: #include <../src/mat/impls/baij/seq/baijmkl/baijmkl.h>
 10: #if defined(PETSC_HAVE_MKL_INTEL_ILP64)
 11:   #define MKL_ILP64
 12: #endif
 13: #include <mkl_spblas.h>

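     /* Check (once per process) whether the linked MKL accepts zero-based indexing for BSR matrices,
      * by attempting to create a trivial 1x1 handle. If it does not, the code below falls back to
      * building one-based copies of the index arrays. */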
 15: static PetscBool PetscSeqBAIJSupportsZeroBased(void)
 16: {
 17:   static PetscBool set = PETSC_FALSE, value;
 18:   int              n   = 1, ia[1], ja[1];
 19:   float            a[1];
 20:   sparse_status_t  status;
 21:   sparse_matrix_t  A;

 23:   if (!set) {
 24:     status = mkl_sparse_s_create_bsr(&A, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_COLUMN_MAJOR, (MKL_INT)n, (MKL_INT)n, (MKL_INT)n, (MKL_INT *)ia, (MKL_INT *)ia, (MKL_INT *)ja, a);
 25:     value  = (status != SPARSE_STATUS_NOT_SUPPORTED) ? PETSC_TRUE : PETSC_FALSE;
 26:     (void)mkl_sparse_destroy(A);
 27:     set = PETSC_TRUE;
 28:   }
 29:   return value;
 30: }

 32: typedef struct {
 33:   PetscBool           sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
 34:   sparse_matrix_t     bsrA;             /* "Handle" used by SpMV2 inspector-executor routines. */
 35:   struct matrix_descr descr;
 36:   PetscInt           *ai1; /* If MKL lacks zero-based BSR support, a one-based copy of the block row pointers. */
 37:   PetscInt           *aj1; /* If MKL lacks zero-based BSR support, a one-based copy of the block column indices. */
 38: } Mat_SeqBAIJMKL;

 40: static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode);
 41: extern PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat, MatAssemblyType);
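     /* Note: mkl_sparse_x_create_bsr() and mkl_sparse_x_mv() used below are not MKL names; they are
      * assumed to be precision-dependent wrappers (defined in the included headers) that resolve to
      * the matching mkl_sparse_{s,d,c,z}_* routine for the configured PetscScalar type. */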

 43: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJMKL_SeqBAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
 44: {
 45:   /* This routine is only called to convert a MATBAIJMKL to its base PETSc type, */
 46:   /* so we will ignore 'MatType type'. */
 47:   Mat             B       = *newmat;
 48:   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;

 50:   PetscFunctionBegin;
 51:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));

 53:   /* Reset the original function pointers. */
 54:   B->ops->duplicate        = MatDuplicate_SeqBAIJ;
 55:   B->ops->assemblyend      = MatAssemblyEnd_SeqBAIJ;
 56:   B->ops->destroy          = MatDestroy_SeqBAIJ;
 57:   B->ops->multtranspose    = MatMultTranspose_SeqBAIJ;
 58:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJ;
 59:   B->ops->scale            = MatScale_SeqBAIJ;
 60:   B->ops->diagonalscale    = MatDiagonalScale_SeqBAIJ;
 61:   B->ops->axpy             = MatAXPY_SeqBAIJ;

 63:   switch (A->rmap->bs) {
 64:   case 1:
 65:     B->ops->mult    = MatMult_SeqBAIJ_1;
 66:     B->ops->multadd = MatMultAdd_SeqBAIJ_1;
 67:     break;
 68:   case 2:
 69:     B->ops->mult    = MatMult_SeqBAIJ_2;
 70:     B->ops->multadd = MatMultAdd_SeqBAIJ_2;
 71:     break;
 72:   case 3:
 73:     B->ops->mult    = MatMult_SeqBAIJ_3;
 74:     B->ops->multadd = MatMultAdd_SeqBAIJ_3;
 75:     break;
 76:   case 4:
 77:     B->ops->mult    = MatMult_SeqBAIJ_4;
 78:     B->ops->multadd = MatMultAdd_SeqBAIJ_4;
 79:     break;
 80:   case 5:
 81:     B->ops->mult    = MatMult_SeqBAIJ_5;
 82:     B->ops->multadd = MatMultAdd_SeqBAIJ_5;
 83:     break;
 84:   case 6:
 85:     B->ops->mult    = MatMult_SeqBAIJ_6;
 86:     B->ops->multadd = MatMultAdd_SeqBAIJ_6;
 87:     break;
 88:   case 7:
 89:     B->ops->mult    = MatMult_SeqBAIJ_7;
 90:     B->ops->multadd = MatMultAdd_SeqBAIJ_7;
 91:     break;
 92:   case 11:
 93:     B->ops->mult    = MatMult_SeqBAIJ_11;
 94:     B->ops->multadd = MatMultAdd_SeqBAIJ_11;
 95:     break;
 96:   case 15:
 97:     B->ops->mult    = MatMult_SeqBAIJ_15_ver1;
 98:     B->ops->multadd = MatMultAdd_SeqBAIJ_N;
 99:     break;
100:   default:
101:     B->ops->mult    = MatMult_SeqBAIJ_N;
102:     B->ops->multadd = MatMultAdd_SeqBAIJ_N;
103:     break;
104:   }
105:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaijmkl_seqbaij_C", NULL));

107:   /* Free everything in the Mat_SeqBAIJMKL data structure. Currently, this
108:    * simply involves destroying the MKL sparse matrix handle and then freeing
109:    * the spptr pointer. */
110:   if (reuse == MAT_INITIAL_MATRIX) baijmkl = (Mat_SeqBAIJMKL *)B->spptr;

112:   if (baijmkl->sparse_optimized) PetscCallExternal(mkl_sparse_destroy, baijmkl->bsrA);
113:   PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
114:   PetscCall(PetscFree(B->spptr));

116:   /* Change the type of B to MATSEQBAIJ. */
117:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));

119:   *newmat = B;
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }
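     /*
       Usage sketch (illustrative, not part of this file): the conversion above is normally reached
       through MatConvert(), which looks up the composed "MatConvert_seqbaijmkl_seqbaij_C" function:

         PetscCall(MatConvert(A, MATSEQBAIJ, MAT_INPLACE_MATRIX, &A));
     */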

123: static PetscErrorCode MatDestroy_SeqBAIJMKL(Mat A)
124: {
125:   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;

127:   PetscFunctionBegin;
128:   if (baijmkl) {
129:     /* Clean up everything in the Mat_SeqBAIJMKL data structure, then free A->spptr. */
130:     if (baijmkl->sparse_optimized) PetscCallExternal(mkl_sparse_destroy, baijmkl->bsrA);
131:     PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
132:     PetscCall(PetscFree(A->spptr));
133:   }

135:   /* Change the type of A back to SEQBAIJ and use MatDestroy_SeqBAIJ()
136:    * to destroy everything that remains. */
137:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQBAIJ));
138:   PetscCall(MatDestroy_SeqBAIJ(A));
139:   PetscFunctionReturn(PETSC_SUCCESS);
140: }

142: static PetscErrorCode MatSeqBAIJMKL_create_mkl_handle(Mat A)
143: {
144:   Mat_SeqBAIJ    *a       = (Mat_SeqBAIJ *)A->data;
145:   Mat_SeqBAIJMKL *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
146:   PetscInt        mbs, nbs, nz, bs;
147:   MatScalar      *aa;
148:   PetscInt       *aj, *ai;
149:   PetscInt        i;

151:   PetscFunctionBegin;
152:   if (baijmkl->sparse_optimized) {
153:     /* Matrix has been previously assembled and optimized. Must destroy old
154:      * matrix handle before running the optimization step again. */
155:     PetscCall(PetscFree2(baijmkl->ai1, baijmkl->aj1));
156:     PetscCallMKL(mkl_sparse_destroy(baijmkl->bsrA));
157:   }
158:   baijmkl->sparse_optimized = PETSC_FALSE;

160:   /* Now perform the SpMV2 setup and matrix optimization. */
161:   baijmkl->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
162:   baijmkl->descr.mode = SPARSE_FILL_MODE_LOWER;
163:   baijmkl->descr.diag = SPARSE_DIAG_NON_UNIT;
164:   mbs                 = a->mbs;
165:   nbs                 = a->nbs;
166:   nz                  = a->nz;
167:   bs                  = A->rmap->bs;
168:   aa                  = a->a;

170:   if ((nz != 0) && !A->structure_only) {
171:     /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
172:      * The MKL sparse inspector-executor routines cannot be passed an empty matrix. */
173:     if (PetscSeqBAIJSupportsZeroBased()) {
174:       aj = a->j;
175:       ai = a->i;
176:       PetscCallMKL(mkl_sparse_x_create_bsr(&baijmkl->bsrA, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_COLUMN_MAJOR, (MKL_INT)mbs, (MKL_INT)nbs, (MKL_INT)bs, (MKL_INT *)ai, (MKL_INT *)(ai + 1), (MKL_INT *)aj, aa));
177:     } else {
178:       PetscCall(PetscMalloc2(mbs + 1, &ai, nz, &aj));
179:       for (i = 0; i < mbs + 1; i++) ai[i] = a->i[i] + 1;
180:       for (i = 0; i < nz; i++) aj[i] = a->j[i] + 1;
181:       aa = a->a;
182:       PetscCallMKL(mkl_sparse_x_create_bsr(&baijmkl->bsrA, SPARSE_INDEX_BASE_ONE, SPARSE_LAYOUT_COLUMN_MAJOR, (MKL_INT)mbs, (MKL_INT)nbs, (MKL_INT)bs, (MKL_INT *)ai, (MKL_INT *)(ai + 1), (MKL_INT *)aj, aa));
183:       baijmkl->ai1 = ai;
184:       baijmkl->aj1 = aj;
185:     }
186:     PetscCallMKL(mkl_sparse_set_mv_hint(baijmkl->bsrA, SPARSE_OPERATION_NON_TRANSPOSE, baijmkl->descr, 1000));
187:     PetscCallMKL(mkl_sparse_set_memory_hint(baijmkl->bsrA, SPARSE_MEMORY_AGGRESSIVE));
188:     PetscCallMKL(mkl_sparse_optimize(baijmkl->bsrA));
189:     baijmkl->sparse_optimized = PETSC_TRUE;
190:   }
191:   PetscFunctionReturn(PETSC_SUCCESS);
192: }
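     /*
       For reference, the inspector-executor sequence performed above follows this general MKL
       pattern (a sketch with placeholder names, shown for double precision):

         sparse_matrix_t     H;
         struct matrix_descr d = {SPARSE_MATRIX_TYPE_GENERAL, SPARSE_FILL_MODE_LOWER, SPARSE_DIAG_NON_UNIT};
         mkl_sparse_d_create_bsr(&H, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_COLUMN_MAJOR, mbs, nbs, bs, rowptr, rowptr + 1, colind, values);
         mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, d, 1000); // expect ~1000 SpMV calls
         mkl_sparse_set_memory_hint(H, SPARSE_MEMORY_AGGRESSIVE);
         mkl_sparse_optimize(H);                                             // inspector step
         // ... repeated mkl_sparse_d_mv() executor calls ...
         mkl_sparse_destroy(H);
     */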

194: static PetscErrorCode MatDuplicate_SeqBAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
195: {
196:   Mat_SeqBAIJMKL *baijmkl;
197:   Mat_SeqBAIJMKL *baijmkl_dest;

199:   PetscFunctionBegin;
200:   PetscCall(MatDuplicate_SeqBAIJ(A, op, M));
201:   baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
202:   PetscCall(PetscNew(&baijmkl_dest));
203:   (*M)->spptr = (void *)baijmkl_dest;
204:   PetscCall(PetscMemcpy(baijmkl_dest, baijmkl, sizeof(Mat_SeqBAIJMKL)));
205:   baijmkl_dest->sparse_optimized = PETSC_FALSE; /* The duplicate's MKL handle will be created lazily, on first use. */
206:   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
207:   PetscFunctionReturn(PETSC_SUCCESS);
208: }

210: static PetscErrorCode MatMult_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy)
211: {
212:   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
213:   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
214:   const PetscScalar *x;
215:   PetscScalar       *y;

217:   PetscFunctionBegin;
218:   /* If there are no nonzero entries, zero yy and return immediately. */
219:   if (!a->nz) {
220:     PetscCall(VecSet(yy, 0.0));
221:     PetscFunctionReturn(PETSC_SUCCESS);
222:   }

224:   PetscCall(VecGetArrayRead(xx, &x));
225:   PetscCall(VecGetArray(yy, &y));

227:   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
228:    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
229:    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
230:   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));

232:   /* Call MKL SpMV2 executor routine to do the MatMult. */
233:   PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, y));

235:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - a->nonzerorowcnt * A->rmap->bs));
236:   PetscCall(VecRestoreArrayRead(xx, &x));
237:   PetscCall(VecRestoreArray(yy, &y));
238:   PetscFunctionReturn(PETSC_SUCCESS);
239: }

241: static PetscErrorCode MatMultTranspose_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy)
242: {
243:   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
244:   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
245:   const PetscScalar *x;
246:   PetscScalar       *y;

248:   PetscFunctionBegin;
249:   /* If there are no nonzero entries, zero yy and return immediately. */
250:   if (!a->nz) {
251:     PetscCall(VecSet(yy, 0.0));
252:     PetscFunctionReturn(PETSC_SUCCESS);
253:   }

255:   PetscCall(VecGetArrayRead(xx, &x));
256:   PetscCall(VecGetArray(yy, &y));

258:   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
259:    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
260:    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
261:   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));

263:   /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
264:   PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, y));

266:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - a->nonzerorowcnt * A->rmap->bs));
267:   PetscCall(VecRestoreArrayRead(xx, &x));
268:   PetscCall(VecRestoreArray(yy, &y));
269:   PetscFunctionReturn(PETSC_SUCCESS);
270: }

272: static PetscErrorCode MatMultAdd_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy, Vec zz)
273: {
274:   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
275:   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
276:   const PetscScalar *x;
277:   PetscScalar       *y, *z;
278:   PetscInt           m = a->mbs * A->rmap->bs;
279:   PetscInt           i;

281:   PetscFunctionBegin;
282:   /* If there are no nonzero entries, set zz = yy and return immediately. */
283:   if (!a->nz) {
284:     PetscCall(VecCopy(yy, zz));
285:     PetscFunctionReturn(PETSC_SUCCESS);
286:   }

288:   PetscCall(VecGetArrayRead(xx, &x));
289:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));

291:   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
292:    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
293:    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
294:   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));

296:   /* Call the MKL sparse BLAS routine to do the MatMultAdd. */
297:   if (zz == yy) {
298:     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
299:      * with alpha and beta both set to 1.0. */
300:     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 1.0, z));
301:   } else {
302:     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
303:      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
304:     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, z));
305:     for (i = 0; i < m; i++) z[i] += y[i];
306:   }

308:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz));
309:   PetscCall(VecRestoreArrayRead(xx, &x));
310:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
311:   PetscFunctionReturn(PETSC_SUCCESS);
312: }

314: static PetscErrorCode MatMultTransposeAdd_SeqBAIJMKL_SpMV2(Mat A, Vec xx, Vec yy, Vec zz)
315: {
316:   Mat_SeqBAIJ       *a       = (Mat_SeqBAIJ *)A->data;
317:   Mat_SeqBAIJMKL    *baijmkl = (Mat_SeqBAIJMKL *)A->spptr;
318:   const PetscScalar *x;
319:   PetscScalar       *y, *z;
320:   PetscInt           n = a->nbs * A->rmap->bs;
321:   PetscInt           i;

324:   PetscFunctionBegin;
325:   /* If there are no nonzero entries, set zz = yy and return immediately. */
326:   if (!a->nz) {
327:     PetscCall(VecCopy(yy, zz));
328:     PetscFunctionReturn(PETSC_SUCCESS);
329:   }

331:   PetscCall(VecGetArrayRead(xx, &x));
332:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));

334:   /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
335:    * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
336:    * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
337:   if (!baijmkl->sparse_optimized) PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));

339:   /* Call the MKL sparse BLAS routine to do the MatMultTransposeAdd. */
340:   if (zz == yy) {
341:     /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y,
342:      * with alpha and beta both set to 1.0. */
343:     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 1.0, z));
344:   } else {
345:     /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then
346:      * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
347:     PetscCallMKL(mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, baijmkl->bsrA, baijmkl->descr, x, 0.0, z));
348:     for (i = 0; i < n; i++) z[i] += y[i];
349:   }

351:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz));
352:   PetscCall(VecRestoreArrayRead(xx, &x));
353:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
354:   PetscFunctionReturn(PETSC_SUCCESS);
355: }

357: static PetscErrorCode MatScale_SeqBAIJMKL(Mat inA, PetscScalar alpha)
358: {
359:   PetscFunctionBegin;
360:   PetscCall(MatScale_SeqBAIJ(inA, alpha));
361:   PetscCall(MatSeqBAIJMKL_create_mkl_handle(inA));
362:   PetscFunctionReturn(PETSC_SUCCESS);
363: }

365: static PetscErrorCode MatDiagonalScale_SeqBAIJMKL(Mat A, Vec ll, Vec rr)
366: {
367:   PetscFunctionBegin;
368:   PetscCall(MatDiagonalScale_SeqBAIJ(A, ll, rr));
369:   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
370:   PetscFunctionReturn(PETSC_SUCCESS);
371: }

373: static PetscErrorCode MatAXPY_SeqBAIJMKL(Mat Y, PetscScalar a, Mat X, MatStructure str)
374: {
375:   PetscFunctionBegin;
376:   PetscCall(MatAXPY_SeqBAIJ(Y, a, X, str));
377:   if (str == SAME_NONZERO_PATTERN) {
378:     /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force update of the MKL matrix handle. */
379:     PetscCall(MatSeqBAIJMKL_create_mkl_handle(Y));
380:   }
381:   PetscFunctionReturn(PETSC_SUCCESS);
382: }
383: /* MatConvert_SeqBAIJ_SeqBAIJMKL converts a SeqBAIJ matrix into a
384:  * SeqBAIJMKL matrix.  This routine is called by the MatCreate_SeqBAIJMKL()
385:  * routine, but can also be used to convert an assembled SeqBAIJ matrix
386:  * into a SeqBAIJMKL one. */
387: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat A, MatType type, MatReuse reuse, Mat *newmat)
388: {
389:   Mat             B = *newmat;
390:   Mat_SeqBAIJMKL *baijmkl;
391:   PetscBool       sametype;

393:   PetscFunctionBegin;
394:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));

396:   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
397:   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);

399:   PetscCall(PetscNew(&baijmkl));
400:   B->spptr = (void *)baijmkl;

402:   /* Set function pointers for methods that we inherit from BAIJ but override.
403:    * The MKL-specific mult/scale/axpy routines are installed later, in MatAssemblyEnd_SeqBAIJMKL(). */
404:   B->ops->assemblyend = MatAssemblyEnd_SeqBAIJMKL;

406:   baijmkl->sparse_optimized = PETSC_FALSE;

408:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatScale_C", MatScale_SeqBAIJMKL));
409:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaijmkl_seqbaij_C", MatConvert_SeqBAIJMKL_SeqBAIJ));

411:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJMKL));
412:   *newmat = B;
413:   PetscFunctionReturn(PETSC_SUCCESS);
414: }
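     /*
       Usage sketch (illustrative, not part of this file): per the comment above, an already assembled
       MATSEQBAIJ matrix can be switched to this class through MatConvert(), e.g.

         PetscCall(MatConvert(A, MATSEQBAIJMKL, MAT_INPLACE_MATRIX, &A));
     */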

416: static PetscErrorCode MatAssemblyEnd_SeqBAIJMKL(Mat A, MatAssemblyType mode)
417: {
418:   PetscFunctionBegin;
419:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
420:   PetscCall(MatAssemblyEnd_SeqBAIJ(A, mode));
421:   PetscCall(MatSeqBAIJMKL_create_mkl_handle(A));
422:   A->ops->destroy          = MatDestroy_SeqBAIJMKL;
423:   A->ops->mult             = MatMult_SeqBAIJMKL_SpMV2;
424:   A->ops->multtranspose    = MatMultTranspose_SeqBAIJMKL_SpMV2;
425:   A->ops->multadd          = MatMultAdd_SeqBAIJMKL_SpMV2;
426:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqBAIJMKL_SpMV2;
427:   A->ops->scale            = MatScale_SeqBAIJMKL;
428:   A->ops->diagonalscale    = MatDiagonalScale_SeqBAIJMKL;
429:   A->ops->axpy             = MatAXPY_SeqBAIJMKL;
430:   A->ops->duplicate        = MatDuplicate_SeqBAIJMKL;
431:   PetscFunctionReturn(PETSC_SUCCESS);
432: }

434: /*@C
435:   MatCreateSeqBAIJMKL - Creates a sparse matrix of type `MATSEQBAIJMKL`.
436:   This type inherits from `MATSEQBAIJ` and is largely identical, but uses sparse BLAS
437:   routines from Intel MKL whenever possible.

439:   Input Parameters:
440: + comm - MPI communicator, set to `PETSC_COMM_SELF`
441: . bs   - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
442:           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
443: . m    - number of rows
444: . n    - number of columns
445: . nz   - number of nonzero blocks per block row (same for all block rows)
446: - nnz  - array containing the number of nonzero blocks in the various block rows
447:          (possibly different for each block row) or `NULL`

449:   Output Parameter:
450: . A - the matrix

452:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
453:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
454:   [MatXXXXSetPreallocation() is, for example, `MatSeqBAIJSetPreallocation()`]

456:   Options Database Keys:
457: + -mat_no_unroll  - uses code that does not unroll the loops in the block calculations (much slower)
458: - -mat_block_size - size of the blocks to use

460:   Level: intermediate

462:   Notes:
463:   The number of rows and columns must be divisible by blocksize.

465:   If the `nnz` parameter is given, then the `nz` parameter is ignored.

467:   A nonzero block is any block that has 1 or more nonzeros in it.

469:   `MatMult()`, `MatMultAdd()`, `MatMultTranspose()`, and `MatMultTransposeAdd()`
470:   operations are currently supported.
471:   If the installed version of MKL supports the "SpMV2" sparse
472:   inspector-executor routines, then those are used by default.
473:   Default PETSc kernels are used otherwise.

475:   The `MATSEQBAIJ` format is fully compatible with standard Fortran
476:   storage.  That is, the stored row and column indices can begin at
477:   either one (as in Fortran) or zero.  See the users' manual for details.

479:   Specify the preallocated storage with either `nz` or `nnz` (not both).
480:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
481:   allocation.  See [Sparse Matrices](sec_matsparse) for details.

484: .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
485: @*/
486: PetscErrorCode MatCreateSeqBAIJMKL(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
487: {
488:   PetscFunctionBegin;
489:   PetscCall(MatCreate(comm, A));
490:   PetscCall(MatSetSizes(*A, m, n, m, n));
491:   PetscCall(MatSetType(*A, MATSEQBAIJMKL));
492:   PetscCall(MatSeqBAIJSetPreallocation_SeqBAIJ(*A, bs, nz, (PetscInt *)nnz));
493:   PetscFunctionReturn(PETSC_SUCCESS);
494: }
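     /*
       Usage sketch (illustrative, not part of this file): direct creation of a 6x6 matrix with
       2x2 blocks and at most 3 nonzero blocks per block row (numbers chosen arbitrarily):

         Mat A;
         PetscCall(MatCreateSeqBAIJMKL(PETSC_COMM_SELF, 2, 6, 6, 3, NULL, &A));
         // ... MatSetValuesBlocked(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult(), MatDestroy() ...
     */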

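     /* Constructor registered for MATSEQBAIJMKL: build the matrix as MATSEQBAIJ first, then convert
      * it in place. This is what MatSetType(A, MATSEQBAIJMKL) (and thus -mat_type seqbaijmkl) invokes. */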
496: PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJMKL(Mat A)
497: {
498:   PetscFunctionBegin;
499:   PetscCall(MatSetType(A, MATSEQBAIJ));
500:   PetscCall(MatConvert_SeqBAIJ_SeqBAIJMKL(A, MATSEQBAIJMKL, MAT_INPLACE_MATRIX, &A));
501:   PetscFunctionReturn(PETSC_SUCCESS);
502: }