Actual source code: matmatmult.c

  1: /*
  2:   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
  3:           C = A * B
  4: */

  6: #include <../src/mat/impls/aij/seq/aij.h>
  7: #include <../src/mat/utils/freespace.h>
  8: #include <petscbt.h>
  9: #include <petsc/private/isimpl.h>
 10: #include <../src/mat/impls/dense/seq/dense.h>

 12: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
 13: {
 14:   PetscFunctionBegin;
 15:   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
 16:   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
 17:   PetscFunctionReturn(PETSC_SUCCESS);
 18: }

 20: /* Modified from MatCreateSeqAIJWithArrays() */
 21: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
 22: {
 23:   PetscInt    ii;
 24:   Mat_SeqAIJ *aij;
 25:   PetscBool   isseqaij, ofree_a, ofree_ij;

 27:   PetscFunctionBegin;
 28:   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
 29:   PetscCall(MatSetSizes(mat, m, n, m, n));

 31:   if (!mtype) {
 32:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
 33:     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
 34:   } else {
 35:     PetscCall(MatSetType(mat, mtype));
 36:   }

 38:   aij      = (Mat_SeqAIJ *)mat->data;
 39:   ofree_a  = aij->free_a;
 40:   ofree_ij = aij->free_ij;
 41:   /* changes the free flags */
 42:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));

 44:   PetscCall(PetscFree(aij->ilen));
 45:   PetscCall(PetscFree(aij->imax));
 46:   PetscCall(PetscMalloc1(m, &aij->imax));
 47:   PetscCall(PetscMalloc1(m, &aij->ilen));
 48:   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
 49:     const PetscInt rnz = i[ii + 1] - i[ii];
 50:     aij->nonzerorowcnt += !!rnz;
 51:     aij->rmax     = PetscMax(aij->rmax, rnz);
 52:     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
 53:   }
 54:   aij->maxnz = i[m];
 55:   aij->nz    = i[m];

 57:   if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
 58:   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
 59:   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));

 61:   aij->i       = i;
 62:   aij->j       = j;
 63:   aij->a       = a;
 64:   aij->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
 65:   aij->free_a  = PETSC_FALSE;
 66:   aij->free_ij = PETSC_FALSE;
 67:   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
 68:   // Always build the diag info when i, j are set
 69:   PetscCall(MatMarkDiagonal_SeqAIJ(mat));
 70:   PetscFunctionReturn(PETSC_SUCCESS);
 71: }

 73: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
 74: {
 75:   Mat_Product        *product = C->product;
 76:   MatProductAlgorithm alg;
 77:   PetscBool           flg;

 79:   PetscFunctionBegin;
 80:   if (product) {
 81:     alg = product->alg;
 82:   } else {
 83:     alg = "sorted";
 84:   }
 85:   /* sorted */
 86:   PetscCall(PetscStrcmp(alg, "sorted", &flg));
 87:   if (flg) {
 88:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
 89:     PetscFunctionReturn(PETSC_SUCCESS);
 90:   }

 92:   /* scalable */
 93:   PetscCall(PetscStrcmp(alg, "scalable", &flg));
 94:   if (flg) {
 95:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
 96:     PetscFunctionReturn(PETSC_SUCCESS);
 97:   }

 99:   /* scalable_fast */
100:   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
101:   if (flg) {
102:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
103:     PetscFunctionReturn(PETSC_SUCCESS);
104:   }

106:   /* heap */
107:   PetscCall(PetscStrcmp(alg, "heap", &flg));
108:   if (flg) {
109:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
110:     PetscFunctionReturn(PETSC_SUCCESS);
111:   }

113:   /* btheap */
114:   PetscCall(PetscStrcmp(alg, "btheap", &flg));
115:   if (flg) {
116:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
117:     PetscFunctionReturn(PETSC_SUCCESS);
118:   }

120:   /* llcondensed */
121:   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
122:   if (flg) {
123:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
124:     PetscFunctionReturn(PETSC_SUCCESS);
125:   }

127:   /* rowmerge */
128:   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
129:   if (flg) {
130:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
131:     PetscFunctionReturn(PETSC_SUCCESS);
132:   }

134: #if defined(PETSC_HAVE_HYPRE)
135:   PetscCall(PetscStrcmp(alg, "hypre", &flg));
136:   if (flg) {
137:     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
138:     PetscFunctionReturn(PETSC_SUCCESS);
139:   }
140: #endif

142:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
143: }

145: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
146: {
147:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
148:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
149:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
150:   PetscReal          afill;
151:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
152:   PetscHMapI         ta;
153:   PetscBT            lnkbt;
154:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

156:   PetscFunctionBegin;
157:   /* Get ci and cj */
158:   /* Allocate ci array, arrays for fill computation and */
159:   /* free space for accumulating nonzero column info */
160:   PetscCall(PetscMalloc1(am + 2, &ci));
161:   ci[0] = 0;

163:   /* create and initialize a linked list */
164:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
165:   MatRowMergeMax_SeqAIJ(b, bm, ta);
166:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
167:   PetscCall(PetscHMapIDestroy(&ta));

169:   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));

171:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
172:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));

174:   current_space = free_space;

176:   /* Determine ci and cj */
177:   for (i = 0; i < am; i++) {
178:     anzi = ai[i + 1] - ai[i];
179:     aj   = a->j + ai[i];
180:     for (j = 0; j < anzi; j++) {
181:       brow = aj[j];
182:       bnzj = bi[brow + 1] - bi[brow];
183:       bj   = b->j + bi[brow];
184:       /* add non-zero cols of B into the sorted linked list lnk */
185:       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
186:     }
187:     /* add possible missing diagonal entry */
188:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
189:     cnzi = lnk[0];

191:     /* If free space is not available, make more free space */
192:     /* Double the amount of total space in the list */
193:     if (current_space->local_remaining < cnzi) {
194:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
195:       ndouble++;
196:     }

198:     /* Copy data into free space, then initialize lnk */
199:     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));

201:     current_space->array += cnzi;
202:     current_space->local_used += cnzi;
203:     current_space->local_remaining -= cnzi;

205:     ci[i + 1] = ci[i] + cnzi;
206:   }

208:   /* Column indices are in the list of free space */
209:   /* Allocate space for cj, initialize cj, and */
210:   /* destroy list of free space and other temporary array(s) */
211:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
212:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
213:   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));

215:   /* put together the new symbolic matrix */
216:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
217:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

219:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
220:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
221:   c          = (Mat_SeqAIJ *)C->data;
222:   c->free_a  = PETSC_FALSE;
223:   c->free_ij = PETSC_TRUE;
224:   c->nonew   = 0;

226:   /* fast, needs non-scalable O(bn) array 'abdense' */
227:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

229:   /* set MatInfo */
230:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
231:   if (afill < 1.0) afill = 1.0;
232:   C->info.mallocs           = ndouble;
233:   C->info.fill_ratio_given  = fill;
234:   C->info.fill_ratio_needed = afill;

236: #if defined(PETSC_USE_INFO)
237:   if (ci[am]) {
238:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
239:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
240:   } else {
241:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
242:   }
243: #endif
244:   PetscFunctionReturn(PETSC_SUCCESS);
245: }

247: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
248: {
249:   PetscLogDouble     flops = 0.0;
250:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
251:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
252:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
253:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
254:   PetscInt           am = A->rmap->n, cm = C->rmap->n;
255:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
256:   PetscScalar       *ca, valtmp;
257:   PetscScalar       *ab_dense;
258:   PetscContainer     cab_dense;
259:   const PetscScalar *aa, *ba, *baj;

261:   PetscFunctionBegin;
262:   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
263:   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
264:   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
265:     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
266:     c->a      = ca;
267:     c->free_a = PETSC_TRUE;
268:   } else ca = c->a;

270:   /* TODO this should be done in the symbolic phase */
271:   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
272:      that is hard to eradicate) */
273:   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
274:   if (!cab_dense) {
275:     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
276:     PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscContainerUserDestroyDefault));
277:   } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
278:   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));

280:   /* clean old values in C */
281:   PetscCall(PetscArrayzero(ca, ci[cm]));
282:   /* Traverse A row-wise. */
283:   /* Build the ith row in C by summing over nonzero columns in A, */
284:   /* the rows of B corresponding to nonzeros of A. */
285:   for (i = 0; i < am; i++) {
286:     anzi = ai[i + 1] - ai[i];
287:     for (j = 0; j < anzi; j++) {
288:       brow = aj[j];
289:       bnzi = bi[brow + 1] - bi[brow];
290:       bjj  = PetscSafePointerPlusOffset(bj, bi[brow]);
291:       baj  = PetscSafePointerPlusOffset(ba, bi[brow]);
292:       /* perform dense axpy */
293:       valtmp = aa[j];
294:       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
295:       flops += 2 * bnzi;
296:     }
297:     aj = PetscSafePointerPlusOffset(aj, anzi);
298:     aa = PetscSafePointerPlusOffset(aa, anzi);

300:     cnzi = ci[i + 1] - ci[i];
301:     for (k = 0; k < cnzi; k++) {
302:       ca[k] += ab_dense[cj[k]];
303:       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304:     }
305:     flops += cnzi;
306:     cj = PetscSafePointerPlusOffset(cj, cnzi);
307:     ca += cnzi;
308:   }
309: #if defined(PETSC_HAVE_DEVICE)
310:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
311: #endif
312:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
313:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
314:   PetscCall(PetscLogFlops(flops));
315:   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
316:   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
317:   PetscFunctionReturn(PETSC_SUCCESS);
318: }

320: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
321: {
322:   PetscLogDouble     flops = 0.0;
323:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
324:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
325:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
326:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
327:   PetscInt           am = A->rmap->N, cm = C->rmap->N;
328:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
329:   PetscScalar       *ca = c->a, valtmp;
330:   const PetscScalar *aa, *ba, *baj;
331:   PetscInt           nextb;

333:   PetscFunctionBegin;
334:   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
335:   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
336:   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
337:     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
338:     c->a      = ca;
339:     c->free_a = PETSC_TRUE;
340:   }

342:   /* clean old values in C */
343:   PetscCall(PetscArrayzero(ca, ci[cm]));
344:   /* Traverse A row-wise. */
345:   /* Build the ith row in C by summing over nonzero columns in A, */
346:   /* the rows of B corresponding to nonzeros of A. */
347:   for (i = 0; i < am; i++) {
348:     anzi = ai[i + 1] - ai[i];
349:     cnzi = ci[i + 1] - ci[i];
350:     for (j = 0; j < anzi; j++) {
351:       brow = aj[j];
352:       bnzi = bi[brow + 1] - bi[brow];
353:       bjj  = bj + bi[brow];
354:       baj  = ba + bi[brow];
355:       /* perform sparse axpy */
356:       valtmp = aa[j];
357:       nextb  = 0;
358:       for (k = 0; nextb < bnzi; k++) {
359:         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
360:           ca[k] += valtmp * baj[nextb++];
361:         }
362:       }
363:       flops += 2 * bnzi;
364:     }
365:     aj += anzi;
366:     aa += anzi;
367:     cj += cnzi;
368:     ca += cnzi;
369:   }
370: #if defined(PETSC_HAVE_DEVICE)
371:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
372: #endif
373:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
374:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
375:   PetscCall(PetscLogFlops(flops));
376:   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
377:   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
378:   PetscFunctionReturn(PETSC_SUCCESS);
379: }

381: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
382: {
383:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
384:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
385:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
386:   MatScalar         *ca;
387:   PetscReal          afill;
388:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
389:   PetscHMapI         ta;
390:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

392:   PetscFunctionBegin;
393:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
394:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
395:   PetscCall(PetscMalloc1(am + 2, &ci));
396:   ci[0] = 0;

398:   /* create and initialize a linked list */
399:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
400:   MatRowMergeMax_SeqAIJ(b, bm, ta);
401:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
402:   PetscCall(PetscHMapIDestroy(&ta));

404:   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));

406:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
407:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
408:   current_space = free_space;

410:   /* Determine ci and cj */
411:   for (i = 0; i < am; i++) {
412:     anzi = ai[i + 1] - ai[i];
413:     aj   = a->j + ai[i];
414:     for (j = 0; j < anzi; j++) {
415:       brow = aj[j];
416:       bnzj = bi[brow + 1] - bi[brow];
417:       bj   = b->j + bi[brow];
418:       /* add non-zero cols of B into the sorted linked list lnk */
419:       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
420:     }
421:     /* add possible missing diagonal entry */
422:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
423:     cnzi = lnk[1];

425:     /* If free space is not available, make more free space */
426:     /* Double the amount of total space in the list */
427:     if (current_space->local_remaining < cnzi) {
428:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
429:       ndouble++;
430:     }

432:     /* Copy data into free space, then initialize lnk */
433:     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));

435:     current_space->array += cnzi;
436:     current_space->local_used += cnzi;
437:     current_space->local_remaining -= cnzi;

439:     ci[i + 1] = ci[i] + cnzi;
440:   }

442:   /* Column indices are in the list of free space */
443:   /* Allocate space for cj, initialize cj, and */
444:   /* destroy list of free space and other temporary array(s) */
445:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
446:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
447:   PetscCall(PetscLLCondensedDestroy_fast(lnk));

449:   /* Allocate space for ca */
450:   PetscCall(PetscCalloc1(ci[am] + 1, &ca));

452:   /* put together the new symbolic matrix */
453:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
454:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

456:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
457:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
458:   c          = (Mat_SeqAIJ *)C->data;
459:   c->free_a  = PETSC_TRUE;
460:   c->free_ij = PETSC_TRUE;
461:   c->nonew   = 0;

463:   /* slower, less memory */
464:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

466:   /* set MatInfo */
467:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
468:   if (afill < 1.0) afill = 1.0;
469:   C->info.mallocs           = ndouble;
470:   C->info.fill_ratio_given  = fill;
471:   C->info.fill_ratio_needed = afill;

473: #if defined(PETSC_USE_INFO)
474:   if (ci[am]) {
475:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
476:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
477:   } else {
478:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
479:   }
480: #endif
481:   PetscFunctionReturn(PETSC_SUCCESS);
482: }

484: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
485: {
486:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
487:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
488:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
489:   MatScalar         *ca;
490:   PetscReal          afill;
491:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
492:   PetscHMapI         ta;
493:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

495:   PetscFunctionBegin;
496:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
497:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
498:   PetscCall(PetscMalloc1(am + 2, &ci));
499:   ci[0] = 0;

501:   /* create and initialize a linked list */
502:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
503:   MatRowMergeMax_SeqAIJ(b, bm, ta);
504:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
505:   PetscCall(PetscHMapIDestroy(&ta));
506:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

508:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
509:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
510:   current_space = free_space;

512:   /* Determine ci and cj */
513:   for (i = 0; i < am; i++) {
514:     anzi = ai[i + 1] - ai[i];
515:     aj   = a->j + ai[i];
516:     for (j = 0; j < anzi; j++) {
517:       brow = aj[j];
518:       bnzj = bi[brow + 1] - bi[brow];
519:       bj   = b->j + bi[brow];
520:       /* add non-zero cols of B into the sorted linked list lnk */
521:       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
522:     }
523:     /* add possible missing diagonal entry */
524:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));

526:     cnzi = lnk[0];

528:     /* If free space is not available, make more free space */
529:     /* Double the amount of total space in the list */
530:     if (current_space->local_remaining < cnzi) {
531:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
532:       ndouble++;
533:     }

535:     /* Copy data into free space, then initialize lnk */
536:     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));

538:     current_space->array += cnzi;
539:     current_space->local_used += cnzi;
540:     current_space->local_remaining -= cnzi;

542:     ci[i + 1] = ci[i] + cnzi;
543:   }

545:   /* Column indices are in the list of free space */
546:   /* Allocate space for cj, initialize cj, and */
547:   /* destroy list of free space and other temporary array(s) */
548:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
549:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
550:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

552:   /* Allocate space for ca */
553:   PetscCall(PetscCalloc1(ci[am] + 1, &ca));

555:   /* put together the new symbolic matrix */
556:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
557:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

559:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
560:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
561:   c          = (Mat_SeqAIJ *)C->data;
562:   c->free_a  = PETSC_TRUE;
563:   c->free_ij = PETSC_TRUE;
564:   c->nonew   = 0;

566:   /* slower, less memory */
567:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

569:   /* set MatInfo */
570:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
571:   if (afill < 1.0) afill = 1.0;
572:   C->info.mallocs           = ndouble;
573:   C->info.fill_ratio_given  = fill;
574:   C->info.fill_ratio_needed = afill;

576: #if defined(PETSC_USE_INFO)
577:   if (ci[am]) {
578:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
579:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
580:   } else {
581:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
582:   }
583: #endif
584:   PetscFunctionReturn(PETSC_SUCCESS);
585: }

587: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
588: {
589:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
590:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
591:   PetscInt          *ci, *cj, *bb;
592:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
593:   PetscReal          afill;
594:   PetscInt           i, j, col, ndouble = 0;
595:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
596:   PetscHeap          h;

598:   PetscFunctionBegin;
599:   /* Get ci and cj - by merging sorted rows using a heap */
600:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
601:   PetscCall(PetscMalloc1(am + 2, &ci));
602:   ci[0] = 0;

604:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
605:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
606:   current_space = free_space;

608:   PetscCall(PetscHeapCreate(a->rmax, &h));
609:   PetscCall(PetscMalloc1(a->rmax, &bb));

611:   /* Determine ci and cj */
612:   for (i = 0; i < am; i++) {
613:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
614:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
615:     ci[i + 1]            = ci[i];
616:     /* Populate the min heap */
617:     for (j = 0; j < anzi; j++) {
618:       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
619:       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
620:         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
621:       }
622:     }
623:     /* Pick off the min element, adding it to free space */
624:     PetscCall(PetscHeapPop(h, &j, &col));
625:     while (j >= 0) {
626:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
627:         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
628:         ndouble++;
629:       }
630:       *(current_space->array++) = col;
631:       current_space->local_used++;
632:       current_space->local_remaining--;
633:       ci[i + 1]++;

635:       /* stash if anything else remains in this row of B */
636:       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
637:       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
638:         PetscInt j2, col2;
639:         PetscCall(PetscHeapPeek(h, &j2, &col2));
640:         if (col2 != col) break;
641:         PetscCall(PetscHeapPop(h, &j2, &col2));
642:         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
643:       }
644:       /* Put any stashed elements back into the min heap */
645:       PetscCall(PetscHeapUnstash(h));
646:       PetscCall(PetscHeapPop(h, &j, &col));
647:     }
648:   }
649:   PetscCall(PetscFree(bb));
650:   PetscCall(PetscHeapDestroy(&h));

652:   /* Column indices are in the list of free space */
653:   /* Allocate space for cj, initialize cj, and */
654:   /* destroy list of free space and other temporary array(s) */
655:   PetscCall(PetscMalloc1(ci[am], &cj));
656:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));

658:   /* put together the new symbolic matrix */
659:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
660:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

662:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
663:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
664:   c          = (Mat_SeqAIJ *)C->data;
665:   c->free_a  = PETSC_TRUE;
666:   c->free_ij = PETSC_TRUE;
667:   c->nonew   = 0;

669:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

671:   /* set MatInfo */
672:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
673:   if (afill < 1.0) afill = 1.0;
674:   C->info.mallocs           = ndouble;
675:   C->info.fill_ratio_given  = fill;
676:   C->info.fill_ratio_needed = afill;

678: #if defined(PETSC_USE_INFO)
679:   if (ci[am]) {
680:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
681:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
682:   } else {
683:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
684:   }
685: #endif
686:   PetscFunctionReturn(PETSC_SUCCESS);
687: }

689: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
690: {
691:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
692:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
693:   PetscInt          *ci, *cj, *bb;
694:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
695:   PetscReal          afill;
696:   PetscInt           i, j, col, ndouble = 0;
697:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
698:   PetscHeap          h;
699:   PetscBT            bt;

701:   PetscFunctionBegin;
702:   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
703:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
704:   PetscCall(PetscMalloc1(am + 2, &ci));
705:   ci[0] = 0;

707:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
708:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));

710:   current_space = free_space;

712:   PetscCall(PetscHeapCreate(a->rmax, &h));
713:   PetscCall(PetscMalloc1(a->rmax, &bb));
714:   PetscCall(PetscBTCreate(bn, &bt));

716:   /* Determine ci and cj */
717:   for (i = 0; i < am; i++) {
718:     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
719:     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
720:     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
721:     ci[i + 1]            = ci[i];
722:     /* Populate the min heap */
723:     for (j = 0; j < anzi; j++) {
724:       PetscInt brow = acol[j];
725:       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
726:         PetscInt bcol = bj[bb[j]];
727:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
728:           PetscCall(PetscHeapAdd(h, j, bcol));
729:           bb[j]++;
730:           break;
731:         }
732:       }
733:     }
734:     /* Pick off the min element, adding it to free space */
735:     PetscCall(PetscHeapPop(h, &j, &col));
736:     while (j >= 0) {
737:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
738:         fptr = NULL;                            /* need PetscBTMemzero */
739:         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
740:         ndouble++;
741:       }
742:       *(current_space->array++) = col;
743:       current_space->local_used++;
744:       current_space->local_remaining--;
745:       ci[i + 1]++;

747:       /* stash if anything else remains in this row of B */
748:       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
749:         PetscInt bcol = bj[bb[j]];
750:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
751:           PetscCall(PetscHeapAdd(h, j, bcol));
752:           bb[j]++;
753:           break;
754:         }
755:       }
756:       PetscCall(PetscHeapPop(h, &j, &col));
757:     }
758:     if (fptr) { /* Clear the bits for this row */
759:       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
760:     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
761:       PetscCall(PetscBTMemzero(bn, bt));
762:     }
763:   }
764:   PetscCall(PetscFree(bb));
765:   PetscCall(PetscHeapDestroy(&h));
766:   PetscCall(PetscBTDestroy(&bt));

768:   /* Column indices are in the list of free space */
769:   /* Allocate space for cj, initialize cj, and */
770:   /* destroy list of free space and other temporary array(s) */
771:   PetscCall(PetscMalloc1(ci[am], &cj));
772:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));

774:   /* put together the new symbolic matrix */
775:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
776:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

778:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
779:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
780:   c          = (Mat_SeqAIJ *)C->data;
781:   c->free_a  = PETSC_TRUE;
782:   c->free_ij = PETSC_TRUE;
783:   c->nonew   = 0;

785:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

787:   /* set MatInfo */
788:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
789:   if (afill < 1.0) afill = 1.0;
790:   C->info.mallocs           = ndouble;
791:   C->info.fill_ratio_given  = fill;
792:   C->info.fill_ratio_needed = afill;

794: #if defined(PETSC_USE_INFO)
795:   if (ci[am]) {
796:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
797:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
798:   } else {
799:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
800:   }
801: #endif
802:   PetscFunctionReturn(PETSC_SUCCESS);
803: }

805: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
806: {
807:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
808:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
809:   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
810:   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
811:   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
812:   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
813:   const PetscInt *brow_ptr[8], *brow_end[8];
814:   PetscInt        window[8];
815:   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
816:   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
817:   PetscReal       afill;
818:   PetscInt       *workj_L1, *workj_L2, *workj_L3;
819:   PetscInt        L1_nnz, L2_nnz;

821:   /* Step 1: Get upper bound on memory required for allocation.
822:              Because of the way virtual memory works,
823:              only the memory pages that are actually needed will be physically allocated. */
824:   PetscFunctionBegin;
825:   PetscCall(PetscMalloc1(am + 1, &ci));
826:   for (i = 0; i < am; i++) {
827:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
828:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
829:     a_rownnz             = 0;
830:     for (k = 0; k < anzi; ++k) {
831:       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
832:       if (a_rownnz > bn) {
833:         a_rownnz = bn;
834:         break;
835:       }
836:     }
837:     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
838:   }
839:   /* temporary work areas for merging rows */
840:   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
841:   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
842:   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));

844:   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
845:   c_maxmem = 8 * (ai[am] + bi[bm]);
846:   /* Step 2: Populate pattern for C */
847:   PetscCall(PetscMalloc1(c_maxmem, &cj));

849:   ci_nnz      = 0;
850:   ci[0]       = 0;
851:   worki_L1[0] = 0;
852:   worki_L2[0] = 0;
853:   for (i = 0; i < am; i++) {
854:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
856:     rowsleft             = anzi;
857:     inputcol_L1          = acol;
858:     L2_nnz               = 0;
859:     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
860:     worki_L2[1]          = 0;
861:     outputi_nnz          = 0;

863:     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
864:     while (ci_nnz + a_maxrownnz > c_maxmem) {
865:       c_maxmem *= 2;
866:       ndouble++;
867:       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
868:     }

870:     while (rowsleft) {
871:       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
872:       L1_nrows    = 0;
873:       L1_nnz      = 0;
874:       inputcol    = inputcol_L1;
875:       inputi      = bi;
876:       inputj      = bj;

878:       /* The following macro is used to specialize for small rows in A.
879:          This helps with compiler unrolling, improving performance substantially.
880:           Input:  inputj   inputi  inputcol  bn
881:           Output: outputj  outputi_nnz                       */
882: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
883:   do { \
884:     window_min  = bn; \
885:     outputi_nnz = 0; \
886:     for (k = 0; k < ANNZ; ++k) { \
887:       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
888:       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
889:       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
890:       window_min  = PetscMin(window[k], window_min); \
891:     } \
892:     while (window_min < bn) { \
893:       outputj[outputi_nnz++] = window_min; \
894:       /* advance front and compute new minimum */ \
895:       old_window_min = window_min; \
896:       window_min     = bn; \
897:       for (k = 0; k < ANNZ; ++k) { \
898:         if (window[k] == old_window_min) { \
899:           brow_ptr[k]++; \
900:           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
901:         } \
902:         window_min = PetscMin(window[k], window_min); \
903:       } \
904:     } \
905:   } while (0)

907:       /************** L E V E L  1 ***************/
908:       /* Merge up to 8 rows of B to L1 work array*/
909:       while (L1_rowsleft) {
910:         outputi_nnz = 0;
911:         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
912:         else outputj = cj + ci_nnz;                /* Merge directly to C */

914:         switch (L1_rowsleft) {
915:         case 1:
916:           brow_ptr[0] = inputj + inputi[inputcol[0]];
917:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
918:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
919:           inputcol += L1_rowsleft;
920:           rowsleft -= L1_rowsleft;
921:           L1_rowsleft = 0;
922:           break;
923:         case 2:
924:           MatMatMultSymbolic_RowMergeMacro(2);
925:           inputcol += L1_rowsleft;
926:           rowsleft -= L1_rowsleft;
927:           L1_rowsleft = 0;
928:           break;
929:         case 3:
930:           MatMatMultSymbolic_RowMergeMacro(3);
931:           inputcol += L1_rowsleft;
932:           rowsleft -= L1_rowsleft;
933:           L1_rowsleft = 0;
934:           break;
935:         case 4:
936:           MatMatMultSymbolic_RowMergeMacro(4);
937:           inputcol += L1_rowsleft;
938:           rowsleft -= L1_rowsleft;
939:           L1_rowsleft = 0;
940:           break;
941:         case 5:
942:           MatMatMultSymbolic_RowMergeMacro(5);
943:           inputcol += L1_rowsleft;
944:           rowsleft -= L1_rowsleft;
945:           L1_rowsleft = 0;
946:           break;
947:         case 6:
948:           MatMatMultSymbolic_RowMergeMacro(6);
949:           inputcol += L1_rowsleft;
950:           rowsleft -= L1_rowsleft;
951:           L1_rowsleft = 0;
952:           break;
953:         case 7:
954:           MatMatMultSymbolic_RowMergeMacro(7);
955:           inputcol += L1_rowsleft;
956:           rowsleft -= L1_rowsleft;
957:           L1_rowsleft = 0;
958:           break;
959:         default:
960:           MatMatMultSymbolic_RowMergeMacro(8);
961:           inputcol += 8;
962:           rowsleft -= 8;
963:           L1_rowsleft -= 8;
964:           break;
965:         }
966:         inputcol_L1 = inputcol;
967:         L1_nnz += outputi_nnz;
968:         worki_L1[++L1_nrows] = L1_nnz;
969:       }

971:       /********************** L E V E L  2 ************************/
972:       /* Merge from L1 work array to either C or to L2 work array */
973:       if (anzi > 8) {
974:         inputi      = worki_L1;
975:         inputj      = workj_L1;
976:         inputcol    = workcol;
977:         outputi_nnz = 0;

979:         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
980:         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */

982:         switch (L1_nrows) {
983:         case 1:
984:           brow_ptr[0] = inputj + inputi[inputcol[0]];
985:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
986:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
987:           break;
988:         case 2:
989:           MatMatMultSymbolic_RowMergeMacro(2);
990:           break;
991:         case 3:
992:           MatMatMultSymbolic_RowMergeMacro(3);
993:           break;
994:         case 4:
995:           MatMatMultSymbolic_RowMergeMacro(4);
996:           break;
997:         case 5:
998:           MatMatMultSymbolic_RowMergeMacro(5);
999:           break;
1000:         case 6:
1001:           MatMatMultSymbolic_RowMergeMacro(6);
1002:           break;
1003:         case 7:
1004:           MatMatMultSymbolic_RowMergeMacro(7);
1005:           break;
1006:         case 8:
1007:           MatMatMultSymbolic_RowMergeMacro(8);
1008:           break;
1009:         default:
1010:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1011:         }
1012:         L2_nnz += outputi_nnz;
1013:         worki_L2[++L2_nrows] = L2_nnz;

1015:         /************************ L E V E L  3 **********************/
1016:         /* Merge from L2 work array to either C or to L2 work array */
1017:         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1018:           inputi      = worki_L2;
1019:           inputj      = workj_L2;
1020:           inputcol    = workcol;
1021:           outputi_nnz = 0;
1022:           if (rowsleft) outputj = workj_L3;
1023:           else outputj = cj + ci_nnz;
1024:           switch (L2_nrows) {
1025:           case 1:
1026:             brow_ptr[0] = inputj + inputi[inputcol[0]];
1027:             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1028:             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1029:             break;
1030:           case 2:
1031:             MatMatMultSymbolic_RowMergeMacro(2);
1032:             break;
1033:           case 3:
1034:             MatMatMultSymbolic_RowMergeMacro(3);
1035:             break;
1036:           case 4:
1037:             MatMatMultSymbolic_RowMergeMacro(4);
1038:             break;
1039:           case 5:
1040:             MatMatMultSymbolic_RowMergeMacro(5);
1041:             break;
1042:           case 6:
1043:             MatMatMultSymbolic_RowMergeMacro(6);
1044:             break;
1045:           case 7:
1046:             MatMatMultSymbolic_RowMergeMacro(7);
1047:             break;
1048:           case 8:
1049:             MatMatMultSymbolic_RowMergeMacro(8);
1050:             break;
1051:           default:
1052:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1053:           }
1054:           L2_nrows    = 1;
1055:           L2_nnz      = outputi_nnz;
1056:           worki_L2[1] = outputi_nnz;
1057:           /* Copy to workj_L2 */
1058:           if (rowsleft) {
1059:             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1060:           }
1061:         }
1062:       }
1063:     } /* while (rowsleft) */
1064: #undef MatMatMultSymbolic_RowMergeMacro

1066:     /* terminate current row */
1067:     ci_nnz += outputi_nnz;
1068:     ci[i + 1] = ci_nnz;
1069:   }

1071:   /* Step 3: Create the new symbolic matrix */
1072:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1073:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

1075:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1076:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1077:   c          = (Mat_SeqAIJ *)C->data;
1078:   c->free_a  = PETSC_TRUE;
1079:   c->free_ij = PETSC_TRUE;
1080:   c->nonew   = 0;

1082:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1084:   /* set MatInfo */
1085:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1086:   if (afill < 1.0) afill = 1.0;
1087:   C->info.mallocs           = ndouble;
1088:   C->info.fill_ratio_given  = fill;
1089:   C->info.fill_ratio_needed = afill;

1091: #if defined(PETSC_USE_INFO)
1092:   if (ci[am]) {
1093:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1094:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1095:   } else {
1096:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1097:   }
1098: #endif

1100:   /* Step 4: Free temporary work areas */
1101:   PetscCall(PetscFree(workj_L1));
1102:   PetscCall(PetscFree(workj_L2));
1103:   PetscCall(PetscFree(workj_L3));
1104:   PetscFunctionReturn(PETSC_SUCCESS);
1105: }

1107: /* concatenate unique entries and then sort */
1108: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1109: {
1110:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1111:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1112:   PetscInt       *ci, *cj, bcol;
1113:   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1114:   PetscReal       afill;
1115:   PetscInt        i, j, ndouble = 0;
1116:   PetscSegBuffer  seg, segrow;
1117:   char           *seen;

1119:   PetscFunctionBegin;
1120:   PetscCall(PetscMalloc1(am + 1, &ci));
1121:   ci[0] = 0;

1123:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1124:   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1125:   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1126:   PetscCall(PetscCalloc1(bn, &seen));

1128:   /* Determine ci and cj */
1129:   for (i = 0; i < am; i++) {
1130:     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1131:     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1132:     PetscInt packlen     = 0, *PETSC_RESTRICT crow;

1134:     /* Pack segrow */
1135:     for (j = 0; j < anzi; j++) {
1136:       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1137:       for (k = bjstart; k < bjend; k++) {
1138:         bcol = bj[k];
1139:         if (!seen[bcol]) { /* new entry */
1140:           PetscInt *PETSC_RESTRICT slot;
1141:           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1142:           *slot      = bcol;
1143:           seen[bcol] = 1;
1144:           packlen++;
1145:         }
1146:       }
1147:     }

1149:     /* Check i-th diagonal entry */
1150:     if (C->force_diagonals && !seen[i]) {
1151:       PetscInt *PETSC_RESTRICT slot;
1152:       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1153:       *slot   = i;
1154:       seen[i] = 1;
1155:       packlen++;
1156:     }

1158:     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1159:     PetscCall(PetscSegBufferExtractTo(segrow, crow));
1160:     PetscCall(PetscSortInt(packlen, crow));
1161:     ci[i + 1] = ci[i] + packlen;
1162:     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1163:   }
1164:   PetscCall(PetscSegBufferDestroy(&segrow));
1165:   PetscCall(PetscFree(seen));

1167:   /* Column indices are in the segmented buffer */
1168:   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1169:   PetscCall(PetscSegBufferDestroy(&seg));

1171:   /* put together the new symbolic matrix */
1172:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1173:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

1175:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1176:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1177:   c          = (Mat_SeqAIJ *)C->data;
1178:   c->free_a  = PETSC_TRUE;
1179:   c->free_ij = PETSC_TRUE;
1180:   c->nonew   = 0;

1182:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1184:   /* set MatInfo */
1185:   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1186:   if (afill < 1.0) afill = 1.0;
1187:   C->info.mallocs           = ndouble;
1188:   C->info.fill_ratio_given  = fill;
1189:   C->info.fill_ratio_needed = afill;

1191: #if defined(PETSC_USE_INFO)
1192:   if (ci[am]) {
1193:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1194:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1195:   } else {
1196:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1197:   }
1198: #endif
1199:   PetscFunctionReturn(PETSC_SUCCESS);
1200: }

1202: static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1203: {
1204:   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;

1206:   PetscFunctionBegin;
1207:   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1208:   PetscCall(MatDestroy(&abt->Bt_den));
1209:   PetscCall(MatDestroy(&abt->ABt_den));
1210:   PetscCall(PetscFree(abt));
1211:   PetscFunctionReturn(PETSC_SUCCESS);
1212: }

1214: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1215: {
1216:   Mat                  Bt;
1217:   Mat_MatMatTransMult *abt;
1218:   Mat_Product         *product = C->product;
1219:   char                *alg;

1221:   PetscFunctionBegin;
1222:   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1223:   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");

1225:   /* create symbolic Bt */
1226:   PetscCall(MatTransposeSymbolic(B, &Bt));
1227:   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1228:   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));

1230:   /* get symbolic C=A*Bt */
1231:   PetscCall(PetscStrallocpy(product->alg, &alg));
1232:   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1233:   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1234:   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1235:   PetscCall(PetscFree(alg));

1237:   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1238:   PetscCall(PetscNew(&abt));

1240:   product->data    = abt;
1241:   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;

1243:   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;

1245:   abt->usecoloring = PETSC_FALSE;
1246:   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1247:   if (abt->usecoloring) {
1248:     /* Create MatTransposeColoring from symbolic C=A*B^T */
1249:     MatTransposeColoring matcoloring;
1250:     MatColoring          coloring;
1251:     ISColoring           iscoloring;
1252:     Mat                  Bt_dense, C_dense;

1254:     /* inode causes memory problem */
1255:     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));

1257:     PetscCall(MatColoringCreate(C, &coloring));
1258:     PetscCall(MatColoringSetDistance(coloring, 2));
1259:     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1260:     PetscCall(MatColoringSetFromOptions(coloring));
1261:     PetscCall(MatColoringApply(coloring, &iscoloring));
1262:     PetscCall(MatColoringDestroy(&coloring));
1263:     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));

1265:     abt->matcoloring = matcoloring;

1267:     PetscCall(ISColoringDestroy(&iscoloring));

1269:     /* Create Bt_dense and C_dense = A*Bt_dense */
1270:     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1271:     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1272:     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1273:     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));

1275:     Bt_dense->assembled = PETSC_TRUE;
1276:     abt->Bt_den         = Bt_dense;

1278:     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1279:     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1280:     PetscCall(MatSetType(C_dense, MATSEQDENSE));
1281:     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));

1283:     Bt_dense->assembled = PETSC_TRUE;
1284:     abt->ABt_den        = C_dense;

1286: #if defined(PETSC_USE_INFO)
1287:     {
1288:       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1289:       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1290:                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1291:     }
1292: #endif
1293:   }
1294:   /* clean up */
1295:   PetscCall(MatDestroy(&Bt));
1296:   PetscFunctionReturn(PETSC_SUCCESS);
1297: }

1299: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1300: {
1301:   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1302:   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1303:   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1304:   PetscLogDouble       flops = 0.0;
1305:   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1306:   Mat_MatMatTransMult *abt;
1307:   Mat_Product         *product = C->product;

1309:   PetscFunctionBegin;
1310:   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1311:   abt = (Mat_MatMatTransMult *)product->data;
1312:   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1313:   /* clear old values in C */
1314:   if (!c->a) {
1315:     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1316:     c->a      = ca;
1317:     c->free_a = PETSC_TRUE;
1318:   } else {
1319:     ca = c->a;
1320:     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1321:   }

1323:   if (abt->usecoloring) {
1324:     MatTransposeColoring matcoloring = abt->matcoloring;
1325:     Mat                  Bt_dense, C_dense = abt->ABt_den;

1327:     /* Get Bt_dense by Apply MatTransposeColoring to B */
1328:     Bt_dense = abt->Bt_den;
1329:     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));

1331:     /* C_dense = A*Bt_dense */
1332:     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));

1334:     /* Recover C from C_dense */
1335:     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1336:     PetscFunctionReturn(PETSC_SUCCESS);
1337:   }

1339:   for (i = 0; i < cm; i++) {
1340:     anzi = ai[i + 1] - ai[i];
1341:     acol = PetscSafePointerPlusOffset(aj, ai[i]);
1342:     aval = PetscSafePointerPlusOffset(aa, ai[i]);
1343:     cnzi = ci[i + 1] - ci[i];
1344:     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
1345:     cval = ca + ci[i];
1346:     for (j = 0; j < cnzi; j++) {
1347:       brow = ccol[j];
1348:       bnzj = bi[brow + 1] - bi[brow];
1349:       bcol = bj + bi[brow];
1350:       bval = ba + bi[brow];

1352:       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1353:       nexta = 0;
1354:       nextb = 0;
1355:       while (nexta < anzi && nextb < bnzj) {
1356:         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1357:         if (nexta == anzi) break;
1358:         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1359:         if (nextb == bnzj) break;
1360:         if (acol[nexta] == bcol[nextb]) {
1361:           cval[j] += aval[nexta] * bval[nextb];
1362:           nexta++;
1363:           nextb++;
1364:           flops += 2;
1365:         }
1366:       }
1367:     }
1368:   }
1369:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1370:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1371:   PetscCall(PetscLogFlops(flops));
1372:   PetscFunctionReturn(PETSC_SUCCESS);
1373: }

1375: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1376: {
1377:   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;

1379:   PetscFunctionBegin;
1380:   PetscCall(MatDestroy(&atb->At));
1381:   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1382:   PetscCall(PetscFree(atb));
1383:   PetscFunctionReturn(PETSC_SUCCESS);
1384: }

1386: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1387: {
1388:   Mat          At      = NULL;
1389:   Mat_Product *product = C->product;
1390:   PetscBool    flg, def, square;

1392:   PetscFunctionBegin;
1393:   MatCheckProduct(C, 4);
1394:   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1395:   /* outerproduct */
1396:   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1397:   if (flg) {
1398:     /* create symbolic At */
1399:     if (!square) {
1400:       PetscCall(MatTransposeSymbolic(A, &At));
1401:       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1402:       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1403:     }
1404:     /* get symbolic C=At*B */
1405:     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1406:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));

1408:     /* clean up */
1409:     if (!square) PetscCall(MatDestroy(&At));

1411:     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1412:     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1413:     PetscFunctionReturn(PETSC_SUCCESS);
1414:   }

1416:   /* matmatmult */
1417:   PetscCall(PetscStrcmp(product->alg, "default", &def));
1418:   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1419:   if (flg || def) {
1420:     Mat_MatTransMatMult *atb;

1422:     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1423:     PetscCall(PetscNew(&atb));
1424:     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1425:     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1426:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1427:     PetscCall(MatProductSetAlgorithm(C, "at*b"));
1428:     product->data    = atb;
1429:     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1430:     atb->At          = At;

1432:     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1433:     PetscFunctionReturn(PETSC_SUCCESS);
1434:   }

1436:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1437: }

1439: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1440: {
1441:   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1442:   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1443:   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1444:   PetscLogDouble flops = 0.0;
1445:   MatScalar     *aa    = a->a, *ba, *ca, *caj;

1447:   PetscFunctionBegin;
1448:   if (!c->a) {
1449:     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));

1451:     c->a      = ca;
1452:     c->free_a = PETSC_TRUE;
1453:   } else {
1454:     ca = c->a;
1455:     PetscCall(PetscArrayzero(ca, ci[cm]));
1456:   }

1458:   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1459:   for (i = 0; i < am; i++) {
1460:     bj   = b->j + bi[i];
1461:     ba   = b->a + bi[i];
1462:     bnzi = bi[i + 1] - bi[i];
1463:     anzi = ai[i + 1] - ai[i];
1464:     for (j = 0; j < anzi; j++) {
1465:       nextb = 0;
1466:       crow  = *aj++;
1467:       cjj   = cj + ci[crow];
1468:       caj   = ca + ci[crow];
1469:       /* perform sparse axpy operation.  Note cjj includes bj. */
1470:       for (k = 0; nextb < bnzi; k++) {
1471:         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1472:           caj[k] += (*aa) * (*(ba + nextb));
1473:           nextb++;
1474:         }
1475:       }
1476:       flops += 2 * bnzi;
1477:       aa++;
1478:     }
1479:   }

1481:   /* Assemble the final matrix and clean up */
1482:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1483:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1484:   PetscCall(PetscLogFlops(flops));
1485:   PetscFunctionReturn(PETSC_SUCCESS);
1486: }

1488: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1489: {
1490:   PetscFunctionBegin;
1491:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1492:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1493:   PetscFunctionReturn(PETSC_SUCCESS);
1494: }

1496: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1497: {
1498:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
1499:   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1500:   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1501:   const PetscInt    *aj;
1502:   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1503:   PetscInt           clda;
1504:   PetscInt           am4, bm4, col, i, j, n;

1506:   PetscFunctionBegin;
1507:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1508:   PetscCall(MatSeqAIJGetArrayRead(A, &av));
1509:   if (add) {
1510:     PetscCall(MatDenseGetArray(C, &c));
1511:   } else {
1512:     PetscCall(MatDenseGetArrayWrite(C, &c));
1513:   }
1514:   PetscCall(MatDenseGetArrayRead(B, &b));
1515:   PetscCall(MatDenseGetLDA(B, &bm));
1516:   PetscCall(MatDenseGetLDA(C, &clda));
1517:   am4 = 4 * clda;
1518:   bm4 = 4 * bm;
1519:   if (b) {
1520:     b1 = b;
1521:     b2 = b1 + bm;
1522:     b3 = b2 + bm;
1523:     b4 = b3 + bm;
1524:   } else b1 = b2 = b3 = b4 = NULL;
1525:   c1 = c;
1526:   c2 = c1 + clda;
1527:   c3 = c2 + clda;
1528:   c4 = c3 + clda;
1529:   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1530:     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1531:       r1 = r2 = r3 = r4 = 0.0;
1532:       n                 = a->i[i + 1] - a->i[i];
1533:       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
1534:       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1535:       for (j = 0; j < n; j++) {
1536:         const PetscScalar aatmp = aa[j];
1537:         const PetscInt    ajtmp = aj[j];
1538:         r1 += aatmp * b1[ajtmp];
1539:         r2 += aatmp * b2[ajtmp];
1540:         r3 += aatmp * b3[ajtmp];
1541:         r4 += aatmp * b4[ajtmp];
1542:       }
1543:       if (add) {
1544:         c1[i] += r1;
1545:         c2[i] += r2;
1546:         c3[i] += r3;
1547:         c4[i] += r4;
1548:       } else {
1549:         c1[i] = r1;
1550:         c2[i] = r2;
1551:         c3[i] = r3;
1552:         c4[i] = r4;
1553:       }
1554:     }
1555:     if (b) {
1556:       b1 += bm4;
1557:       b2 += bm4;
1558:       b3 += bm4;
1559:       b4 += bm4;
1560:     }
1561:     c1 += am4;
1562:     c2 += am4;
1563:     c3 += am4;
1564:     c4 += am4;
1565:   }
1566:   /* process remaining columns */
1567:   if (col != cn) {
1568:     PetscInt rc = cn - col;

1570:     if (rc == 1) {
1571:       for (i = 0; i < am; i++) {
1572:         r1 = 0.0;
1573:         n  = a->i[i + 1] - a->i[i];
1574:         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1575:         aa = PetscSafePointerPlusOffset(av, a->i[i]);
1576:         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1577:         if (add) c1[i] += r1;
1578:         else c1[i] = r1;
1579:       }
1580:     } else if (rc == 2) {
1581:       for (i = 0; i < am; i++) {
1582:         r1 = r2 = 0.0;
1583:         n       = a->i[i + 1] - a->i[i];
1584:         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
1585:         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1586:         for (j = 0; j < n; j++) {
1587:           const PetscScalar aatmp = aa[j];
1588:           const PetscInt    ajtmp = aj[j];
1589:           r1 += aatmp * b1[ajtmp];
1590:           r2 += aatmp * b2[ajtmp];
1591:         }
1592:         if (add) {
1593:           c1[i] += r1;
1594:           c2[i] += r2;
1595:         } else {
1596:           c1[i] = r1;
1597:           c2[i] = r2;
1598:         }
1599:       }
1600:     } else {
1601:       for (i = 0; i < am; i++) {
1602:         r1 = r2 = r3 = 0.0;
1603:         n            = a->i[i + 1] - a->i[i];
1604:         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
1605:         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
1606:         for (j = 0; j < n; j++) {
1607:           const PetscScalar aatmp = aa[j];
1608:           const PetscInt    ajtmp = aj[j];
1609:           r1 += aatmp * b1[ajtmp];
1610:           r2 += aatmp * b2[ajtmp];
1611:           r3 += aatmp * b3[ajtmp];
1612:         }
1613:         if (add) {
1614:           c1[i] += r1;
1615:           c2[i] += r2;
1616:           c3[i] += r3;
1617:         } else {
1618:           c1[i] = r1;
1619:           c2[i] = r2;
1620:           c3[i] = r3;
1621:         }
1622:       }
1623:     }
1624:   }
1625:   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1626:   if (add) {
1627:     PetscCall(MatDenseRestoreArray(C, &c));
1628:   } else {
1629:     PetscCall(MatDenseRestoreArrayWrite(C, &c));
1630:   }
1631:   PetscCall(MatDenseRestoreArrayRead(B, &b));
1632:   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1633:   PetscFunctionReturn(PETSC_SUCCESS);
1634: }

1636: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1637: {
1638:   PetscFunctionBegin;
1639:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1640:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1641:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);

1643:   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1644:   PetscFunctionReturn(PETSC_SUCCESS);
1645: }

1647: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1648: {
1649:   PetscFunctionBegin;
1650:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1651:   C->ops->productsymbolic = MatProductSymbolic_AB;
1652:   PetscFunctionReturn(PETSC_SUCCESS);
1653: }

1655: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);

1657: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1658: {
1659:   PetscFunctionBegin;
1660:   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1661:   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1662:   PetscFunctionReturn(PETSC_SUCCESS);
1663: }

1665: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1666: {
1667:   PetscFunctionBegin;
1668:   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1669:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1670:   PetscFunctionReturn(PETSC_SUCCESS);
1671: }

1673: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1674: {
1675:   Mat_Product *product = C->product;

1677:   PetscFunctionBegin;
1678:   switch (product->type) {
1679:   case MATPRODUCT_AB:
1680:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1681:     break;
1682:   case MATPRODUCT_AtB:
1683:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1684:     break;
1685:   case MATPRODUCT_ABt:
1686:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1687:     break;
1688:   default:
1689:     break;
1690:   }
1691:   PetscFunctionReturn(PETSC_SUCCESS);
1692: }

1694: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1695: {
1696:   Mat_Product *product = C->product;
1697:   Mat          A       = product->A;
1698:   PetscBool    baij;

1700:   PetscFunctionBegin;
1701:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1702:   if (!baij) { /* A is seqsbaij */
1703:     PetscBool sbaij;
1704:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1705:     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");

1707:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1708:   } else { /* A is seqbaij */
1709:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1710:   }

1712:   C->ops->productsymbolic = MatProductSymbolic_AB;
1713:   PetscFunctionReturn(PETSC_SUCCESS);
1714: }

1716: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1717: {
1718:   Mat_Product *product = C->product;

1720:   PetscFunctionBegin;
1721:   MatCheckProduct(C, 1);
1722:   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1723:   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1724:   PetscFunctionReturn(PETSC_SUCCESS);
1725: }

1727: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1728: {
1729:   PetscFunctionBegin;
1730:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1731:   C->ops->productsymbolic = MatProductSymbolic_AB;
1732:   PetscFunctionReturn(PETSC_SUCCESS);
1733: }

1735: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1736: {
1737:   Mat_Product *product = C->product;

1739:   PetscFunctionBegin;
1740:   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1741:   PetscFunctionReturn(PETSC_SUCCESS);
1742: }

1744: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1745: {
1746:   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
1747:   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1748:   PetscInt     *bi = b->i, *bj = b->j;
1749:   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1750:   MatScalar    *btval, *btval_den, *ba = b->a;
1751:   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;

1753:   PetscFunctionBegin;
1754:   btval_den = btdense->v;
1755:   PetscCall(PetscArrayzero(btval_den, m * n));
1756:   for (k = 0; k < ncolors; k++) {
1757:     ncolumns = coloring->ncolumns[k];
1758:     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1759:       col   = *(columns + colorforcol[k] + l);
1760:       btcol = bj + bi[col];
1761:       btval = ba + bi[col];
1762:       anz   = bi[col + 1] - bi[col];
1763:       for (j = 0; j < anz; j++) {
1764:         brow            = btcol[j];
1765:         btval_den[brow] = btval[j];
1766:       }
1767:     }
1768:     btval_den += m;
1769:   }
1770:   PetscFunctionReturn(PETSC_SUCCESS);
1771: }

1773: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1774: {
1775:   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
1776:   const PetscScalar *ca_den, *ca_den_ptr;
1777:   PetscScalar       *ca = csp->a;
1778:   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1779:   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1780:   PetscInt           nrows, *row, *idx;
1781:   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;

1783:   PetscFunctionBegin;
1784:   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));

1786:   if (brows > 0) {
1787:     PetscInt *lstart, row_end, row_start;
1788:     lstart = matcoloring->lstart;
1789:     PetscCall(PetscArrayzero(lstart, ncolors));

1791:     row_end = brows;
1792:     if (row_end > m) row_end = m;
1793:     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1794:       ca_den_ptr = ca_den;
1795:       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1796:         nrows = matcoloring->nrows[k];
1797:         row   = rows + colorforrow[k];
1798:         idx   = den2sp + colorforrow[k];
1799:         for (l = lstart[k]; l < nrows; l++) {
1800:           if (row[l] >= row_end) {
1801:             lstart[k] = l;
1802:             break;
1803:           } else {
1804:             ca[idx[l]] = ca_den_ptr[row[l]];
1805:           }
1806:         }
1807:         ca_den_ptr += m;
1808:       }
1809:       row_end += brows;
1810:       if (row_end > m) row_end = m;
1811:     }
1812:   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1813:     ca_den_ptr = ca_den;
1814:     for (k = 0; k < ncolors; k++) {
1815:       nrows = matcoloring->nrows[k];
1816:       row   = rows + colorforrow[k];
1817:       idx   = den2sp + colorforrow[k];
1818:       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1819:       ca_den_ptr += m;
1820:     }
1821:   }

1823:   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1824: #if defined(PETSC_USE_INFO)
1825:   if (matcoloring->brows > 0) {
1826:     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1827:   } else {
1828:     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1829:   }
1830: #endif
1831:   PetscFunctionReturn(PETSC_SUCCESS);
1832: }

1834: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1835: {
1836:   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1837:   const PetscInt *is, *ci, *cj, *row_idx;
1838:   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1839:   IS             *isa;
1840:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1841:   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1842:   PetscInt       *colorforcol, *columns, *columns_i, brows;
1843:   PetscBool       flg;

1845:   PetscFunctionBegin;
1846:   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));

1848:   /* bs >1 is not being tested yet! */
1849:   Nbs       = mat->cmap->N / bs;
1850:   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1851:   c->N      = Nbs;
1852:   c->m      = c->M;
1853:   c->rstart = 0;
1854:   c->brows  = 100;

1856:   c->ncolors = nis;
1857:   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1858:   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1859:   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));

1861:   brows = c->brows;
1862:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1863:   if (flg) c->brows = brows;
1864:   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));

1866:   colorforrow[0] = 0;
1867:   rows_i         = rows;
1868:   den2sp_i       = den2sp;

1870:   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1871:   PetscCall(PetscMalloc1(Nbs + 1, &columns));

1873:   colorforcol[0] = 0;
1874:   columns_i      = columns;

1876:   /* get column-wise storage of mat */
1877:   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));

1879:   cm = c->m;
1880:   PetscCall(PetscMalloc1(cm + 1, &rowhit));
1881:   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1882:   for (i = 0; i < nis; i++) { /* loop over color */
1883:     PetscCall(ISGetLocalSize(isa[i], &n));
1884:     PetscCall(ISGetIndices(isa[i], &is));

1886:     c->ncolumns[i] = n;
1887:     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1888:     colorforcol[i + 1] = colorforcol[i] + n;
1889:     columns_i += n;

1891:     /* fast, crude version requires O(N*N) work */
1892:     PetscCall(PetscArrayzero(rowhit, cm));

1894:     for (j = 0; j < n; j++) { /* loop over columns*/
1895:       col     = is[j];
1896:       row_idx = cj + ci[col];
1897:       m       = ci[col + 1] - ci[col];
1898:       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1899:         idxhit[*row_idx]   = spidx[ci[col] + k];
1900:         rowhit[*row_idx++] = col + 1;
1901:       }
1902:     }
1903:     /* count the number of hits */
1904:     nrows = 0;
1905:     for (j = 0; j < cm; j++) {
1906:       if (rowhit[j]) nrows++;
1907:     }
1908:     c->nrows[i]        = nrows;
1909:     colorforrow[i + 1] = colorforrow[i] + nrows;

1911:     nrows = 0;
1912:     for (j = 0; j < cm; j++) { /* loop over rows */
1913:       if (rowhit[j]) {
1914:         rows_i[nrows]   = j;
1915:         den2sp_i[nrows] = idxhit[j];
1916:         nrows++;
1917:       }
1918:     }
1919:     den2sp_i += nrows;

1921:     PetscCall(ISRestoreIndices(isa[i], &is));
1922:     rows_i += nrows;
1923:   }
1924:   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1925:   PetscCall(PetscFree(rowhit));
1926:   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1927:   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);

1929:   c->colorforrow = colorforrow;
1930:   c->rows        = rows;
1931:   c->den2sp      = den2sp;
1932:   c->colorforcol = colorforcol;
1933:   c->columns     = columns;

1935:   PetscCall(PetscFree(idxhit));
1936:   PetscFunctionReturn(PETSC_SUCCESS);
1937: }

1939: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1940: {
1941:   Mat_Product *product = C->product;
1942:   Mat          A = product->A, B = product->B;

1944:   PetscFunctionBegin;
1945:   if (C->ops->mattransposemultnumeric) {
1946:     /* Alg: "outerproduct" */
1947:     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1948:   } else {
1949:     /* Alg: "matmatmult" -- C = At*B */
1950:     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;

1952:     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1953:     if (atb->At) {
1954:       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1955:          user may have called MatProductReplaceMats() to get this A=product->A */
1956:       PetscCall(MatTransposeSetPrecursor(A, atb->At));
1957:       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1958:     }
1959:     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1960:   }
1961:   PetscFunctionReturn(PETSC_SUCCESS);
1962: }

1964: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1965: {
1966:   Mat_Product *product = C->product;
1967:   Mat          A = product->A, B = product->B;
1968:   PetscReal    fill = product->fill;

1970:   PetscFunctionBegin;
1971:   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));

1973:   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1974:   PetscFunctionReturn(PETSC_SUCCESS);
1975: }

1977: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1978: {
1979:   Mat_Product *product = C->product;
1980:   PetscInt     alg     = 0; /* default algorithm */
1981:   PetscBool    flg     = PETSC_FALSE;
1982: #if !defined(PETSC_HAVE_HYPRE)
1983:   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1984:   PetscInt    nalg        = 7;
1985: #else
1986:   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1987:   PetscInt    nalg        = 8;
1988: #endif

1990:   PetscFunctionBegin;
1991:   /* Set default algorithm */
1992:   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
1993:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

1995:   /* Get runtime option */
1996:   if (product->api_user) {
1997:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
1998:     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
1999:     PetscOptionsEnd();
2000:   } else {
2001:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2002:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2003:     PetscOptionsEnd();
2004:   }
2005:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2007:   C->ops->productsymbolic = MatProductSymbolic_AB;
2008:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2009:   PetscFunctionReturn(PETSC_SUCCESS);
2010: }

2012: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2013: {
2014:   Mat_Product *product     = C->product;
2015:   PetscInt     alg         = 0; /* default algorithm */
2016:   PetscBool    flg         = PETSC_FALSE;
2017:   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2018:   PetscInt     nalg        = 3;

2020:   PetscFunctionBegin;
2021:   /* Get runtime option */
2022:   if (product->api_user) {
2023:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2024:     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2025:     PetscOptionsEnd();
2026:   } else {
2027:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2028:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2029:     PetscOptionsEnd();
2030:   }
2031:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2033:   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2034:   PetscFunctionReturn(PETSC_SUCCESS);
2035: }

2037: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2038: {
2039:   Mat_Product *product     = C->product;
2040:   PetscInt     alg         = 0; /* default algorithm */
2041:   PetscBool    flg         = PETSC_FALSE;
2042:   const char  *algTypes[2] = {"default", "color"};
2043:   PetscInt     nalg        = 2;

2045:   PetscFunctionBegin;
2046:   /* Set default algorithm */
2047:   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2048:   if (!flg) {
2049:     alg = 1;
2050:     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2051:   }

2053:   /* Get runtime option */
2054:   if (product->api_user) {
2055:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2056:     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2057:     PetscOptionsEnd();
2058:   } else {
2059:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2060:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2061:     PetscOptionsEnd();
2062:   }
2063:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2065:   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2066:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2067:   PetscFunctionReturn(PETSC_SUCCESS);
2068: }

2070: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2071: {
2072:   Mat_Product *product = C->product;
2073:   PetscBool    flg     = PETSC_FALSE;
2074:   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
2075: #if !defined(PETSC_HAVE_HYPRE)
2076:   const char *algTypes[2] = {"scalable", "rap"};
2077:   PetscInt    nalg        = 2;
2078: #else
2079:   const char *algTypes[3] = {"scalable", "rap", "hypre"};
2080:   PetscInt    nalg        = 3;
2081: #endif

2083:   PetscFunctionBegin;
2084:   /* Set default algorithm */
2085:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2086:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2088:   /* Get runtime option */
2089:   if (product->api_user) {
2090:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2091:     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2092:     PetscOptionsEnd();
2093:   } else {
2094:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2095:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2096:     PetscOptionsEnd();
2097:   }
2098:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2100:   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2101:   PetscFunctionReturn(PETSC_SUCCESS);
2102: }

2104: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2105: {
2106:   Mat_Product *product     = C->product;
2107:   PetscBool    flg         = PETSC_FALSE;
2108:   PetscInt     alg         = 0; /* default algorithm */
2109:   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2110:   PetscInt     nalg        = 3;

2112:   PetscFunctionBegin;
2113:   /* Set default algorithm */
2114:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2115:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2117:   /* Get runtime option */
2118:   if (product->api_user) {
2119:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2120:     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2121:     PetscOptionsEnd();
2122:   } else {
2123:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2124:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2125:     PetscOptionsEnd();
2126:   }
2127:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2129:   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2130:   PetscFunctionReturn(PETSC_SUCCESS);
2131: }

2133: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2134: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2135: {
2136:   Mat_Product *product     = C->product;
2137:   PetscInt     alg         = 0; /* default algorithm */
2138:   PetscBool    flg         = PETSC_FALSE;
2139:   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2140:   PetscInt     nalg        = 7;

2142:   PetscFunctionBegin;
2143:   /* Set default algorithm */
2144:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2145:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2147:   /* Get runtime option */
2148:   if (product->api_user) {
2149:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2150:     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2151:     PetscOptionsEnd();
2152:   } else {
2153:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2154:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2155:     PetscOptionsEnd();
2156:   }
2157:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2159:   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2160:   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2161:   PetscFunctionReturn(PETSC_SUCCESS);
2162: }

2164: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2165: {
2166:   Mat_Product *product = C->product;

2168:   PetscFunctionBegin;
2169:   switch (product->type) {
2170:   case MATPRODUCT_AB:
2171:     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2172:     break;
2173:   case MATPRODUCT_AtB:
2174:     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2175:     break;
2176:   case MATPRODUCT_ABt:
2177:     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2178:     break;
2179:   case MATPRODUCT_PtAP:
2180:     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2181:     break;
2182:   case MATPRODUCT_RARt:
2183:     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2184:     break;
2185:   case MATPRODUCT_ABC:
2186:     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2187:     break;
2188:   default:
2189:     break;
2190:   }
2191:   PetscFunctionReturn(PETSC_SUCCESS);
2192: }