Actual source code: matmatmult.c
1: /*
2: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3: C = A * B
4: */
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/utils/freespace.h>
8: #include <petscbt.h>
9: #include <petsc/private/isimpl.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
12: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13: {
14: PetscFunctionBegin;
15: if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16: else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
17: PetscFunctionReturn(PETSC_SUCCESS);
18: }
20: /* Modified from MatCreateSeqAIJWithArrays() */
21: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22: {
23: PetscInt ii;
24: Mat_SeqAIJ *aij;
25: PetscBool isseqaij, ofree_a, ofree_ij;
27: PetscFunctionBegin;
28: PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
29: PetscCall(MatSetSizes(mat, m, n, m, n));
31: if (!mtype) {
32: PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
33: if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34: } else {
35: PetscCall(MatSetType(mat, mtype));
36: }
38: aij = (Mat_SeqAIJ *)mat->data;
39: ofree_a = aij->free_a;
40: ofree_ij = aij->free_ij;
41: /* changes the free flags */
42: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
44: PetscCall(PetscFree(aij->ilen));
45: PetscCall(PetscFree(aij->imax));
46: PetscCall(PetscMalloc1(m, &aij->imax));
47: PetscCall(PetscMalloc1(m, &aij->ilen));
48: for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49: const PetscInt rnz = i[ii + 1] - i[ii];
50: aij->nonzerorowcnt += !!rnz;
51: aij->rmax = PetscMax(aij->rmax, rnz);
52: aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53: }
54: aij->maxnz = i[m];
55: aij->nz = i[m];
57: if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
58: if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
59: if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
61: aij->i = i;
62: aij->j = j;
63: aij->a = a;
64: aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
65: aij->free_a = PETSC_FALSE;
66: aij->free_ij = PETSC_FALSE;
67: PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
68: // Always build the diag info when i, j are set
69: PetscFunctionReturn(PETSC_SUCCESS);
70: }
72: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
73: {
74: Mat_Product *product = C->product;
75: MatProductAlgorithm alg;
76: PetscBool flg;
78: PetscFunctionBegin;
79: if (product) {
80: alg = product->alg;
81: } else {
82: alg = "sorted";
83: }
84: /* sorted */
85: PetscCall(PetscStrcmp(alg, "sorted", &flg));
86: if (flg) {
87: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
88: PetscFunctionReturn(PETSC_SUCCESS);
89: }
91: /* scalable */
92: PetscCall(PetscStrcmp(alg, "scalable", &flg));
93: if (flg) {
94: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
95: PetscFunctionReturn(PETSC_SUCCESS);
96: }
98: /* scalable_fast */
99: PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
100: if (flg) {
101: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
102: PetscFunctionReturn(PETSC_SUCCESS);
103: }
105: /* heap */
106: PetscCall(PetscStrcmp(alg, "heap", &flg));
107: if (flg) {
108: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
109: PetscFunctionReturn(PETSC_SUCCESS);
110: }
112: /* btheap */
113: PetscCall(PetscStrcmp(alg, "btheap", &flg));
114: if (flg) {
115: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
116: PetscFunctionReturn(PETSC_SUCCESS);
117: }
119: /* llcondensed */
120: PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
121: if (flg) {
122: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: /* rowmerge */
127: PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
128: if (flg) {
129: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
130: PetscFunctionReturn(PETSC_SUCCESS);
131: }
133: #if defined(PETSC_HAVE_HYPRE)
134: PetscCall(PetscStrcmp(alg, "hypre", &flg));
135: if (flg) {
136: PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
137: PetscFunctionReturn(PETSC_SUCCESS);
138: }
139: #endif
141: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
142: }
144: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
145: {
146: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
147: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
148: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
149: PetscReal afill;
150: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
151: PetscHMapI ta;
152: PetscBT lnkbt;
153: PetscFreeSpaceList free_space = NULL, current_space = NULL;
155: PetscFunctionBegin;
156: /* Get ci and cj */
157: /* Allocate ci array, arrays for fill computation and */
158: /* free space for accumulating nonzero column info */
159: PetscCall(PetscMalloc1(am + 2, &ci));
160: ci[0] = 0;
162: /* create and initialize a linked list */
163: PetscCall(PetscHMapICreateWithSize(bn, &ta));
164: MatRowMergeMax_SeqAIJ(b, bm, ta);
165: PetscCall(PetscHMapIGetSize(ta, &Crmax));
166: PetscCall(PetscHMapIDestroy(&ta));
168: PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
170: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
171: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
173: current_space = free_space;
175: /* Determine ci and cj */
176: for (i = 0; i < am; i++) {
177: anzi = ai[i + 1] - ai[i];
178: aj = a->j + ai[i];
179: for (j = 0; j < anzi; j++) {
180: brow = aj[j];
181: bnzj = bi[brow + 1] - bi[brow];
182: bj = b->j + bi[brow];
183: /* add non-zero cols of B into the sorted linked list lnk */
184: PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
185: }
186: /* add possible missing diagonal entry */
187: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
188: cnzi = lnk[0];
190: /* If free space is not available, make more free space */
191: /* Double the amount of total space in the list */
192: if (current_space->local_remaining < cnzi) {
193: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
194: ndouble++;
195: }
197: /* Copy data into free space, then initialize lnk */
198: PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
200: current_space->array += cnzi;
201: current_space->local_used += cnzi;
202: current_space->local_remaining -= cnzi;
204: ci[i + 1] = ci[i] + cnzi;
205: }
207: /* Column indices are in the list of free space */
208: /* Allocate space for cj, initialize cj, and */
209: /* destroy list of free space and other temporary array(s) */
210: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
211: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
212: PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
214: /* put together the new symbolic matrix */
215: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
216: PetscCall(MatSetBlockSizesFromMats(C, A, B));
218: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
219: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
220: c = (Mat_SeqAIJ *)C->data;
221: c->free_a = PETSC_FALSE;
222: c->free_ij = PETSC_TRUE;
223: c->nonew = 0;
225: /* fast, needs non-scalable O(bn) array 'abdense' */
226: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
228: /* set MatInfo */
229: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
230: if (afill < 1.0) afill = 1.0;
231: C->info.mallocs = ndouble;
232: C->info.fill_ratio_given = fill;
233: C->info.fill_ratio_needed = afill;
235: #if defined(PETSC_USE_INFO)
236: if (ci[am]) {
237: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
238: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
239: } else {
240: PetscCall(PetscInfo(C, "Empty matrix product\n"));
241: }
242: #endif
243: PetscFunctionReturn(PETSC_SUCCESS);
244: }
246: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
247: {
248: PetscLogDouble flops = 0.0;
249: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
250: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
251: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
252: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
253: PetscInt am = A->rmap->n, cm = C->rmap->n;
254: PetscInt i, j, k, anzi, bnzi, cnzi, brow;
255: PetscScalar *ca, valtmp;
256: PetscScalar *ab_dense;
257: PetscContainer cab_dense;
258: const PetscScalar *aa, *ba, *baj;
260: PetscFunctionBegin;
261: PetscCall(MatSeqAIJGetArrayRead(A, &aa));
262: PetscCall(MatSeqAIJGetArrayRead(B, &ba));
263: if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
264: PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
265: c->a = ca;
266: c->free_a = PETSC_TRUE;
267: } else ca = c->a;
269: /* TODO this should be done in the symbolic phase */
270: /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
271: that is hard to eradicate) */
272: PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
273: if (!cab_dense) {
274: PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
275: PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault));
276: } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
277: PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
279: /* clean old values in C */
280: PetscCall(PetscArrayzero(ca, ci[cm]));
281: /* Traverse A row-wise. */
282: /* Build the ith row in C by summing over nonzero columns in A, */
283: /* the rows of B corresponding to nonzeros of A. */
284: for (i = 0; i < am; i++) {
285: anzi = ai[i + 1] - ai[i];
286: for (j = 0; j < anzi; j++) {
287: brow = aj[j];
288: bnzi = bi[brow + 1] - bi[brow];
289: bjj = PetscSafePointerPlusOffset(bj, bi[brow]);
290: baj = PetscSafePointerPlusOffset(ba, bi[brow]);
291: /* perform dense axpy */
292: valtmp = aa[j];
293: for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
294: flops += 2 * bnzi;
295: }
296: aj = PetscSafePointerPlusOffset(aj, anzi);
297: aa = PetscSafePointerPlusOffset(aa, anzi);
299: cnzi = ci[i + 1] - ci[i];
300: for (k = 0; k < cnzi; k++) {
301: ca[k] += ab_dense[cj[k]];
302: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
303: }
304: flops += cnzi;
305: cj = PetscSafePointerPlusOffset(cj, cnzi);
306: ca += cnzi;
307: }
308: #if defined(PETSC_HAVE_DEVICE)
309: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
310: #endif
311: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
312: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
313: PetscCall(PetscLogFlops(flops));
314: PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
315: PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
316: PetscFunctionReturn(PETSC_SUCCESS);
317: }
319: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
320: {
321: PetscLogDouble flops = 0.0;
322: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
323: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
324: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
325: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
326: PetscInt am = A->rmap->N, cm = C->rmap->N;
327: PetscInt i, j, k, anzi, bnzi, cnzi, brow;
328: PetscScalar *ca = c->a, valtmp;
329: const PetscScalar *aa, *ba, *baj;
330: PetscInt nextb;
332: PetscFunctionBegin;
333: PetscCall(MatSeqAIJGetArrayRead(A, &aa));
334: PetscCall(MatSeqAIJGetArrayRead(B, &ba));
335: if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
336: PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
337: c->a = ca;
338: c->free_a = PETSC_TRUE;
339: }
341: /* clean old values in C */
342: PetscCall(PetscArrayzero(ca, ci[cm]));
343: /* Traverse A row-wise. */
344: /* Build the ith row in C by summing over nonzero columns in A, */
345: /* the rows of B corresponding to nonzeros of A. */
346: for (i = 0; i < am; i++) {
347: anzi = ai[i + 1] - ai[i];
348: cnzi = ci[i + 1] - ci[i];
349: for (j = 0; j < anzi; j++) {
350: brow = aj[j];
351: bnzi = bi[brow + 1] - bi[brow];
352: bjj = bj + bi[brow];
353: baj = ba + bi[brow];
354: /* perform sparse axpy */
355: valtmp = aa[j];
356: nextb = 0;
357: for (k = 0; nextb < bnzi; k++) {
358: if (cj[k] == bjj[nextb]) { /* ccol == bcol */
359: ca[k] += valtmp * baj[nextb++];
360: }
361: }
362: flops += 2 * bnzi;
363: }
364: aj += anzi;
365: aa += anzi;
366: cj += cnzi;
367: ca += cnzi;
368: }
369: #if defined(PETSC_HAVE_DEVICE)
370: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
371: #endif
372: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
373: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
374: PetscCall(PetscLogFlops(flops));
375: PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
376: PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
377: PetscFunctionReturn(PETSC_SUCCESS);
378: }
380: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
381: {
382: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
383: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
384: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
385: MatScalar *ca;
386: PetscReal afill;
387: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
388: PetscHMapI ta;
389: PetscFreeSpaceList free_space = NULL, current_space = NULL;
391: PetscFunctionBegin;
392: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
393: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
394: PetscCall(PetscMalloc1(am + 2, &ci));
395: ci[0] = 0;
397: /* create and initialize a linked list */
398: PetscCall(PetscHMapICreateWithSize(bn, &ta));
399: MatRowMergeMax_SeqAIJ(b, bm, ta);
400: PetscCall(PetscHMapIGetSize(ta, &Crmax));
401: PetscCall(PetscHMapIDestroy(&ta));
403: PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
405: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
406: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
407: current_space = free_space;
409: /* Determine ci and cj */
410: for (i = 0; i < am; i++) {
411: anzi = ai[i + 1] - ai[i];
412: aj = a->j + ai[i];
413: for (j = 0; j < anzi; j++) {
414: brow = aj[j];
415: bnzj = bi[brow + 1] - bi[brow];
416: bj = b->j + bi[brow];
417: /* add non-zero cols of B into the sorted linked list lnk */
418: PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
419: }
420: /* add possible missing diagonal entry */
421: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
422: cnzi = lnk[1];
424: /* If free space is not available, make more free space */
425: /* Double the amount of total space in the list */
426: if (current_space->local_remaining < cnzi) {
427: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
428: ndouble++;
429: }
431: /* Copy data into free space, then initialize lnk */
432: PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
434: current_space->array += cnzi;
435: current_space->local_used += cnzi;
436: current_space->local_remaining -= cnzi;
438: ci[i + 1] = ci[i] + cnzi;
439: }
441: /* Column indices are in the list of free space */
442: /* Allocate space for cj, initialize cj, and */
443: /* destroy list of free space and other temporary array(s) */
444: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
445: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
446: PetscCall(PetscLLCondensedDestroy_fast(lnk));
448: /* Allocate space for ca */
449: PetscCall(PetscCalloc1(ci[am] + 1, &ca));
451: /* put together the new symbolic matrix */
452: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
453: PetscCall(MatSetBlockSizesFromMats(C, A, B));
455: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
456: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
457: c = (Mat_SeqAIJ *)C->data;
458: c->free_a = PETSC_TRUE;
459: c->free_ij = PETSC_TRUE;
460: c->nonew = 0;
462: /* slower, less memory */
463: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
465: /* set MatInfo */
466: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
467: if (afill < 1.0) afill = 1.0;
468: C->info.mallocs = ndouble;
469: C->info.fill_ratio_given = fill;
470: C->info.fill_ratio_needed = afill;
472: #if defined(PETSC_USE_INFO)
473: if (ci[am]) {
474: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
475: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
476: } else {
477: PetscCall(PetscInfo(C, "Empty matrix product\n"));
478: }
479: #endif
480: PetscFunctionReturn(PETSC_SUCCESS);
481: }
483: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
484: {
485: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
486: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
487: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
488: MatScalar *ca;
489: PetscReal afill;
490: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
491: PetscHMapI ta;
492: PetscFreeSpaceList free_space = NULL, current_space = NULL;
494: PetscFunctionBegin;
495: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
496: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
497: PetscCall(PetscMalloc1(am + 2, &ci));
498: ci[0] = 0;
500: /* create and initialize a linked list */
501: PetscCall(PetscHMapICreateWithSize(bn, &ta));
502: MatRowMergeMax_SeqAIJ(b, bm, ta);
503: PetscCall(PetscHMapIGetSize(ta, &Crmax));
504: PetscCall(PetscHMapIDestroy(&ta));
505: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
507: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
508: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
509: current_space = free_space;
511: /* Determine ci and cj */
512: for (i = 0; i < am; i++) {
513: anzi = ai[i + 1] - ai[i];
514: aj = a->j + ai[i];
515: for (j = 0; j < anzi; j++) {
516: brow = aj[j];
517: bnzj = bi[brow + 1] - bi[brow];
518: bj = b->j + bi[brow];
519: /* add non-zero cols of B into the sorted linked list lnk */
520: PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
521: }
522: /* add possible missing diagonal entry */
523: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
525: cnzi = lnk[0];
527: /* If free space is not available, make more free space */
528: /* Double the amount of total space in the list */
529: if (current_space->local_remaining < cnzi) {
530: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
531: ndouble++;
532: }
534: /* Copy data into free space, then initialize lnk */
535: PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
537: current_space->array += cnzi;
538: current_space->local_used += cnzi;
539: current_space->local_remaining -= cnzi;
541: ci[i + 1] = ci[i] + cnzi;
542: }
544: /* Column indices are in the list of free space */
545: /* Allocate space for cj, initialize cj, and */
546: /* destroy list of free space and other temporary array(s) */
547: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
548: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
549: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
551: /* Allocate space for ca */
552: PetscCall(PetscCalloc1(ci[am] + 1, &ca));
554: /* put together the new symbolic matrix */
555: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
556: PetscCall(MatSetBlockSizesFromMats(C, A, B));
558: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
559: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
560: c = (Mat_SeqAIJ *)C->data;
561: c->free_a = PETSC_TRUE;
562: c->free_ij = PETSC_TRUE;
563: c->nonew = 0;
565: /* slower, less memory */
566: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
568: /* set MatInfo */
569: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
570: if (afill < 1.0) afill = 1.0;
571: C->info.mallocs = ndouble;
572: C->info.fill_ratio_given = fill;
573: C->info.fill_ratio_needed = afill;
575: #if defined(PETSC_USE_INFO)
576: if (ci[am]) {
577: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
578: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
579: } else {
580: PetscCall(PetscInfo(C, "Empty matrix product\n"));
581: }
582: #endif
583: PetscFunctionReturn(PETSC_SUCCESS);
584: }
586: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
587: {
588: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
589: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
590: PetscInt *ci, *cj, *bb;
591: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
592: PetscReal afill;
593: PetscInt i, j, col, ndouble = 0;
594: PetscFreeSpaceList free_space = NULL, current_space = NULL;
595: PetscHeap h;
597: PetscFunctionBegin;
598: /* Get ci and cj - by merging sorted rows using a heap */
599: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
600: PetscCall(PetscMalloc1(am + 2, &ci));
601: ci[0] = 0;
603: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
604: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
605: current_space = free_space;
607: PetscCall(PetscHeapCreate(a->rmax, &h));
608: PetscCall(PetscMalloc1(a->rmax, &bb));
610: /* Determine ci and cj */
611: for (i = 0; i < am; i++) {
612: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
613: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
614: ci[i + 1] = ci[i];
615: /* Populate the min heap */
616: for (j = 0; j < anzi; j++) {
617: bb[j] = bi[acol[j]]; /* bb points at the start of the row */
618: if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
619: PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
620: }
621: }
622: /* Pick off the min element, adding it to free space */
623: PetscCall(PetscHeapPop(h, &j, &col));
624: while (j >= 0) {
625: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
626: PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
627: ndouble++;
628: }
629: *(current_space->array++) = col;
630: current_space->local_used++;
631: current_space->local_remaining--;
632: ci[i + 1]++;
634: /* stash if anything else remains in this row of B */
635: if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
636: while (1) { /* pop and stash any other rows of B that also had an entry in this column */
637: PetscInt j2, col2;
638: PetscCall(PetscHeapPeek(h, &j2, &col2));
639: if (col2 != col) break;
640: PetscCall(PetscHeapPop(h, &j2, &col2));
641: if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
642: }
643: /* Put any stashed elements back into the min heap */
644: PetscCall(PetscHeapUnstash(h));
645: PetscCall(PetscHeapPop(h, &j, &col));
646: }
647: }
648: PetscCall(PetscFree(bb));
649: PetscCall(PetscHeapDestroy(&h));
651: /* Column indices are in the list of free space */
652: /* Allocate space for cj, initialize cj, and */
653: /* destroy list of free space and other temporary array(s) */
654: PetscCall(PetscMalloc1(ci[am], &cj));
655: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
657: /* put together the new symbolic matrix */
658: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
659: PetscCall(MatSetBlockSizesFromMats(C, A, B));
661: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
662: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
663: c = (Mat_SeqAIJ *)C->data;
664: c->free_a = PETSC_TRUE;
665: c->free_ij = PETSC_TRUE;
666: c->nonew = 0;
668: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
670: /* set MatInfo */
671: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
672: if (afill < 1.0) afill = 1.0;
673: C->info.mallocs = ndouble;
674: C->info.fill_ratio_given = fill;
675: C->info.fill_ratio_needed = afill;
677: #if defined(PETSC_USE_INFO)
678: if (ci[am]) {
679: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
680: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
681: } else {
682: PetscCall(PetscInfo(C, "Empty matrix product\n"));
683: }
684: #endif
685: PetscFunctionReturn(PETSC_SUCCESS);
686: }
688: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
689: {
690: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
691: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
692: PetscInt *ci, *cj, *bb;
693: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
694: PetscReal afill;
695: PetscInt i, j, col, ndouble = 0;
696: PetscFreeSpaceList free_space = NULL, current_space = NULL;
697: PetscHeap h;
698: PetscBT bt;
700: PetscFunctionBegin;
701: /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
702: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
703: PetscCall(PetscMalloc1(am + 2, &ci));
704: ci[0] = 0;
706: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
707: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
709: current_space = free_space;
711: PetscCall(PetscHeapCreate(a->rmax, &h));
712: PetscCall(PetscMalloc1(a->rmax, &bb));
713: PetscCall(PetscBTCreate(bn, &bt));
715: /* Determine ci and cj */
716: for (i = 0; i < am; i++) {
717: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
718: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
719: const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
720: ci[i + 1] = ci[i];
721: /* Populate the min heap */
722: for (j = 0; j < anzi; j++) {
723: PetscInt brow = acol[j];
724: for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
725: PetscInt bcol = bj[bb[j]];
726: if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
727: PetscCall(PetscHeapAdd(h, j, bcol));
728: bb[j]++;
729: break;
730: }
731: }
732: }
733: /* Pick off the min element, adding it to free space */
734: PetscCall(PetscHeapPop(h, &j, &col));
735: while (j >= 0) {
736: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
737: fptr = NULL; /* need PetscBTMemzero */
738: PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
739: ndouble++;
740: }
741: *(current_space->array++) = col;
742: current_space->local_used++;
743: current_space->local_remaining--;
744: ci[i + 1]++;
746: /* stash if anything else remains in this row of B */
747: for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
748: PetscInt bcol = bj[bb[j]];
749: if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
750: PetscCall(PetscHeapAdd(h, j, bcol));
751: bb[j]++;
752: break;
753: }
754: }
755: PetscCall(PetscHeapPop(h, &j, &col));
756: }
757: if (fptr) { /* Clear the bits for this row */
758: for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
759: } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
760: PetscCall(PetscBTMemzero(bn, bt));
761: }
762: }
763: PetscCall(PetscFree(bb));
764: PetscCall(PetscHeapDestroy(&h));
765: PetscCall(PetscBTDestroy(&bt));
767: /* Column indices are in the list of free space */
768: /* Allocate space for cj, initialize cj, and */
769: /* destroy list of free space and other temporary array(s) */
770: PetscCall(PetscMalloc1(ci[am], &cj));
771: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
773: /* put together the new symbolic matrix */
774: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
775: PetscCall(MatSetBlockSizesFromMats(C, A, B));
777: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
778: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
779: c = (Mat_SeqAIJ *)C->data;
780: c->free_a = PETSC_TRUE;
781: c->free_ij = PETSC_TRUE;
782: c->nonew = 0;
784: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
786: /* set MatInfo */
787: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
788: if (afill < 1.0) afill = 1.0;
789: C->info.mallocs = ndouble;
790: C->info.fill_ratio_given = fill;
791: C->info.fill_ratio_needed = afill;
793: #if defined(PETSC_USE_INFO)
794: if (ci[am]) {
795: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
796: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
797: } else {
798: PetscCall(PetscInfo(C, "Empty matrix product\n"));
799: }
800: #endif
801: PetscFunctionReturn(PETSC_SUCCESS);
802: }
804: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
805: {
806: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
807: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
808: PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
809: PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz;
810: const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
811: const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
812: const PetscInt *brow_ptr[8], *brow_end[8];
813: PetscInt window[8];
814: PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
815: PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft;
816: PetscReal afill;
817: PetscInt *workj_L1, *workj_L2, *workj_L3;
818: PetscInt L1_nnz, L2_nnz;
820: /* Step 1: Get upper bound on memory required for allocation.
821: Because of the way virtual memory works,
822: only the memory pages that are actually needed will be physically allocated. */
823: PetscFunctionBegin;
824: PetscCall(PetscMalloc1(am + 1, &ci));
825: for (i = 0; i < am; i++) {
826: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
827: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
828: a_rownnz = 0;
829: for (k = 0; k < anzi; ++k) {
830: a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
831: if (a_rownnz > bn) {
832: a_rownnz = bn;
833: break;
834: }
835: }
836: a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
837: }
838: /* temporary work areas for merging rows */
839: PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
840: PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
841: PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
843: /* This should be enough for almost all matrices. If not, memory is reallocated later. */
844: c_maxmem = 8 * (ai[am] + bi[bm]);
845: /* Step 2: Populate pattern for C */
846: PetscCall(PetscMalloc1(c_maxmem, &cj));
848: ci_nnz = 0;
849: ci[0] = 0;
850: worki_L1[0] = 0;
851: worki_L2[0] = 0;
852: for (i = 0; i < am; i++) {
853: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
854: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
855: rowsleft = anzi;
856: inputcol_L1 = acol;
857: L2_nnz = 0;
858: L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */
859: worki_L2[1] = 0;
860: outputi_nnz = 0;
862: /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */
863: while (ci_nnz + a_maxrownnz > c_maxmem) {
864: c_maxmem *= 2;
865: ndouble++;
866: PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
867: }
869: while (rowsleft) {
870: L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
871: L1_nrows = 0;
872: L1_nnz = 0;
873: inputcol = inputcol_L1;
874: inputi = bi;
875: inputj = bj;
877: /* The following macro is used to specialize for small rows in A.
878: This helps with compiler unrolling, improving performance substantially.
879: Input: inputj inputi inputcol bn
880: Output: outputj outputi_nnz */
881: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
882: do { \
883: window_min = bn; \
884: outputi_nnz = 0; \
885: for (k = 0; k < ANNZ; ++k) { \
886: brow_ptr[k] = inputj + inputi[inputcol[k]]; \
887: brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
888: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
889: window_min = PetscMin(window[k], window_min); \
890: } \
891: while (window_min < bn) { \
892: outputj[outputi_nnz++] = window_min; \
893: /* advance front and compute new minimum */ \
894: old_window_min = window_min; \
895: window_min = bn; \
896: for (k = 0; k < ANNZ; ++k) { \
897: if (window[k] == old_window_min) { \
898: brow_ptr[k]++; \
899: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900: } \
901: window_min = PetscMin(window[k], window_min); \
902: } \
903: } \
904: } while (0)
906: /************** L E V E L 1 ***************/
907: /* Merge up to 8 rows of B to L1 work array*/
908: while (L1_rowsleft) {
909: outputi_nnz = 0;
910: if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
911: else outputj = cj + ci_nnz; /* Merge directly to C */
913: switch (L1_rowsleft) {
914: case 1:
915: brow_ptr[0] = inputj + inputi[inputcol[0]];
916: brow_end[0] = inputj + inputi[inputcol[0] + 1];
917: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
918: inputcol += L1_rowsleft;
919: rowsleft -= L1_rowsleft;
920: L1_rowsleft = 0;
921: break;
922: case 2:
923: MatMatMultSymbolic_RowMergeMacro(2);
924: inputcol += L1_rowsleft;
925: rowsleft -= L1_rowsleft;
926: L1_rowsleft = 0;
927: break;
928: case 3:
929: MatMatMultSymbolic_RowMergeMacro(3);
930: inputcol += L1_rowsleft;
931: rowsleft -= L1_rowsleft;
932: L1_rowsleft = 0;
933: break;
934: case 4:
935: MatMatMultSymbolic_RowMergeMacro(4);
936: inputcol += L1_rowsleft;
937: rowsleft -= L1_rowsleft;
938: L1_rowsleft = 0;
939: break;
940: case 5:
941: MatMatMultSymbolic_RowMergeMacro(5);
942: inputcol += L1_rowsleft;
943: rowsleft -= L1_rowsleft;
944: L1_rowsleft = 0;
945: break;
946: case 6:
947: MatMatMultSymbolic_RowMergeMacro(6);
948: inputcol += L1_rowsleft;
949: rowsleft -= L1_rowsleft;
950: L1_rowsleft = 0;
951: break;
952: case 7:
953: MatMatMultSymbolic_RowMergeMacro(7);
954: inputcol += L1_rowsleft;
955: rowsleft -= L1_rowsleft;
956: L1_rowsleft = 0;
957: break;
958: default:
959: MatMatMultSymbolic_RowMergeMacro(8);
960: inputcol += 8;
961: rowsleft -= 8;
962: L1_rowsleft -= 8;
963: break;
964: }
965: inputcol_L1 = inputcol;
966: L1_nnz += outputi_nnz;
967: worki_L1[++L1_nrows] = L1_nnz;
968: }
970: /********************** L E V E L 2 ************************/
971: /* Merge from L1 work array to either C or to L2 work array */
972: if (anzi > 8) {
973: inputi = worki_L1;
974: inputj = workj_L1;
975: inputcol = workcol;
976: outputi_nnz = 0;
978: if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
979: else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */
981: switch (L1_nrows) {
982: case 1:
983: brow_ptr[0] = inputj + inputi[inputcol[0]];
984: brow_end[0] = inputj + inputi[inputcol[0] + 1];
985: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
986: break;
987: case 2:
988: MatMatMultSymbolic_RowMergeMacro(2);
989: break;
990: case 3:
991: MatMatMultSymbolic_RowMergeMacro(3);
992: break;
993: case 4:
994: MatMatMultSymbolic_RowMergeMacro(4);
995: break;
996: case 5:
997: MatMatMultSymbolic_RowMergeMacro(5);
998: break;
999: case 6:
1000: MatMatMultSymbolic_RowMergeMacro(6);
1001: break;
1002: case 7:
1003: MatMatMultSymbolic_RowMergeMacro(7);
1004: break;
1005: case 8:
1006: MatMatMultSymbolic_RowMergeMacro(8);
1007: break;
1008: default:
1009: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1010: }
1011: L2_nnz += outputi_nnz;
1012: worki_L2[++L2_nrows] = L2_nnz;
1014: /************************ L E V E L 3 **********************/
1015: /* Merge from L2 work array to either C or to L2 work array */
1016: if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1017: inputi = worki_L2;
1018: inputj = workj_L2;
1019: inputcol = workcol;
1020: outputi_nnz = 0;
1021: if (rowsleft) outputj = workj_L3;
1022: else outputj = cj + ci_nnz;
1023: switch (L2_nrows) {
1024: case 1:
1025: brow_ptr[0] = inputj + inputi[inputcol[0]];
1026: brow_end[0] = inputj + inputi[inputcol[0] + 1];
1027: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1028: break;
1029: case 2:
1030: MatMatMultSymbolic_RowMergeMacro(2);
1031: break;
1032: case 3:
1033: MatMatMultSymbolic_RowMergeMacro(3);
1034: break;
1035: case 4:
1036: MatMatMultSymbolic_RowMergeMacro(4);
1037: break;
1038: case 5:
1039: MatMatMultSymbolic_RowMergeMacro(5);
1040: break;
1041: case 6:
1042: MatMatMultSymbolic_RowMergeMacro(6);
1043: break;
1044: case 7:
1045: MatMatMultSymbolic_RowMergeMacro(7);
1046: break;
1047: case 8:
1048: MatMatMultSymbolic_RowMergeMacro(8);
1049: break;
1050: default:
1051: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1052: }
1053: L2_nrows = 1;
1054: L2_nnz = outputi_nnz;
1055: worki_L2[1] = outputi_nnz;
1056: /* Copy to workj_L2 */
1057: if (rowsleft) {
1058: for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1059: }
1060: }
1061: }
1062: } /* while (rowsleft) */
1063: #undef MatMatMultSymbolic_RowMergeMacro
1065: /* terminate current row */
1066: ci_nnz += outputi_nnz;
1067: ci[i + 1] = ci_nnz;
1068: }
1070: /* Step 3: Create the new symbolic matrix */
1071: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1072: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1074: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1075: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1076: c = (Mat_SeqAIJ *)C->data;
1077: c->free_a = PETSC_TRUE;
1078: c->free_ij = PETSC_TRUE;
1079: c->nonew = 0;
1081: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1083: /* set MatInfo */
1084: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1085: if (afill < 1.0) afill = 1.0;
1086: C->info.mallocs = ndouble;
1087: C->info.fill_ratio_given = fill;
1088: C->info.fill_ratio_needed = afill;
1090: #if defined(PETSC_USE_INFO)
1091: if (ci[am]) {
1092: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1093: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1094: } else {
1095: PetscCall(PetscInfo(C, "Empty matrix product\n"));
1096: }
1097: #endif
1099: /* Step 4: Free temporary work areas */
1100: PetscCall(PetscFree(workj_L1));
1101: PetscCall(PetscFree(workj_L2));
1102: PetscCall(PetscFree(workj_L3));
1103: PetscFunctionReturn(PETSC_SUCCESS);
1104: }
1106: /* concatenate unique entries and then sort */
1107: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1108: {
1109: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1110: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1111: PetscInt *ci, *cj, bcol;
1112: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1113: PetscReal afill;
1114: PetscInt i, j, ndouble = 0;
1115: PetscSegBuffer seg, segrow;
1116: char *seen;
1118: PetscFunctionBegin;
1119: PetscCall(PetscMalloc1(am + 1, &ci));
1120: ci[0] = 0;
1122: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1123: PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1124: PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1125: PetscCall(PetscCalloc1(bn, &seen));
1127: /* Determine ci and cj */
1128: for (i = 0; i < am; i++) {
1129: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1130: const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1131: PetscInt packlen = 0, *PETSC_RESTRICT crow;
1133: /* Pack segrow */
1134: for (j = 0; j < anzi; j++) {
1135: PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1136: for (k = bjstart; k < bjend; k++) {
1137: bcol = bj[k];
1138: if (!seen[bcol]) { /* new entry */
1139: PetscInt *PETSC_RESTRICT slot;
1140: PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1141: *slot = bcol;
1142: seen[bcol] = 1;
1143: packlen++;
1144: }
1145: }
1146: }
1148: /* Check i-th diagonal entry */
1149: if (C->force_diagonals && !seen[i]) {
1150: PetscInt *PETSC_RESTRICT slot;
1151: PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1152: *slot = i;
1153: seen[i] = 1;
1154: packlen++;
1155: }
1157: PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1158: PetscCall(PetscSegBufferExtractTo(segrow, crow));
1159: PetscCall(PetscSortInt(packlen, crow));
1160: ci[i + 1] = ci[i] + packlen;
1161: for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1162: }
1163: PetscCall(PetscSegBufferDestroy(&segrow));
1164: PetscCall(PetscFree(seen));
1166: /* Column indices are in the segmented buffer */
1167: PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1168: PetscCall(PetscSegBufferDestroy(&seg));
1170: /* put together the new symbolic matrix */
1171: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1172: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1174: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1175: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1176: c = (Mat_SeqAIJ *)C->data;
1177: c->free_a = PETSC_TRUE;
1178: c->free_ij = PETSC_TRUE;
1179: c->nonew = 0;
1181: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1183: /* set MatInfo */
1184: afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1185: if (afill < 1.0) afill = 1.0;
1186: C->info.mallocs = ndouble;
1187: C->info.fill_ratio_given = fill;
1188: C->info.fill_ratio_needed = afill;
1190: #if defined(PETSC_USE_INFO)
1191: if (ci[am]) {
1192: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1193: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1194: } else {
1195: PetscCall(PetscInfo(C, "Empty matrix product\n"));
1196: }
1197: #endif
1198: PetscFunctionReturn(PETSC_SUCCESS);
1199: }
1201: static PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(void **data)
1202: {
1203: MatProductCtx_MatMatTransMult *abt = *(MatProductCtx_MatMatTransMult **)data;
1205: PetscFunctionBegin;
1206: PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1207: PetscCall(MatDestroy(&abt->Bt_den));
1208: PetscCall(MatDestroy(&abt->ABt_den));
1209: PetscCall(PetscFree(abt));
1210: PetscFunctionReturn(PETSC_SUCCESS);
1211: }
1213: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1214: {
1215: Mat Bt;
1216: MatProductCtx_MatMatTransMult *abt;
1217: Mat_Product *product = C->product;
1218: char *alg;
1220: PetscFunctionBegin;
1221: PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1222: PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1224: /* create symbolic Bt */
1225: PetscCall(MatTransposeSymbolic(B, &Bt));
1226: PetscCall(MatSetBlockSizes(Bt, A->cmap->bs, B->cmap->bs));
1227: PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
1229: /* get symbolic C=A*Bt */
1230: PetscCall(PetscStrallocpy(product->alg, &alg));
1231: PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1232: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1233: PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1234: PetscCall(PetscFree(alg));
1236: /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1237: PetscCall(PetscNew(&abt));
1239: product->data = abt;
1240: product->destroy = MatProductCtxDestroy_SeqAIJ_MatMatMultTrans;
1242: C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1244: abt->usecoloring = PETSC_FALSE;
1245: PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1246: if (abt->usecoloring) {
1247: /* Create MatTransposeColoring from symbolic C=A*B^T */
1248: MatTransposeColoring matcoloring;
1249: MatColoring coloring;
1250: ISColoring iscoloring;
1251: Mat Bt_dense, C_dense;
1253: /* inode causes memory problem */
1254: PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
1256: PetscCall(MatColoringCreate(C, &coloring));
1257: PetscCall(MatColoringSetDistance(coloring, 2));
1258: PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1259: PetscCall(MatColoringSetFromOptions(coloring));
1260: PetscCall(MatColoringApply(coloring, &iscoloring));
1261: PetscCall(MatColoringDestroy(&coloring));
1262: PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
1264: abt->matcoloring = matcoloring;
1266: PetscCall(ISColoringDestroy(&iscoloring));
1268: /* Create Bt_dense and C_dense = A*Bt_dense */
1269: PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1270: PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1271: PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1272: PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
1274: Bt_dense->assembled = PETSC_TRUE;
1275: abt->Bt_den = Bt_dense;
1277: PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1278: PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1279: PetscCall(MatSetType(C_dense, MATSEQDENSE));
1280: PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
1282: Bt_dense->assembled = PETSC_TRUE;
1283: abt->ABt_den = C_dense;
1285: #if defined(PETSC_USE_INFO)
1286: {
1287: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1288: PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1289: Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1290: }
1291: #endif
1292: }
1293: /* clean up */
1294: PetscCall(MatDestroy(&Bt));
1295: PetscFunctionReturn(PETSC_SUCCESS);
1296: }
1298: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1299: {
1300: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1301: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1302: PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1303: PetscLogDouble flops = 0.0;
1304: MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1305: MatProductCtx_MatMatTransMult *abt;
1306: Mat_Product *product = C->product;
1308: PetscFunctionBegin;
1309: PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1310: abt = (MatProductCtx_MatMatTransMult *)product->data;
1311: PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1312: /* clear old values in C */
1313: if (!c->a) {
1314: PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1315: c->a = ca;
1316: c->free_a = PETSC_TRUE;
1317: } else {
1318: ca = c->a;
1319: PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1320: }
1322: if (abt->usecoloring) {
1323: MatTransposeColoring matcoloring = abt->matcoloring;
1324: Mat Bt_dense, C_dense = abt->ABt_den;
1326: /* Get Bt_dense by Apply MatTransposeColoring to B */
1327: Bt_dense = abt->Bt_den;
1328: PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1330: /* C_dense = A*Bt_dense */
1331: PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1333: /* Recover C from C_dense */
1334: PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1335: PetscFunctionReturn(PETSC_SUCCESS);
1336: }
1338: for (i = 0; i < cm; i++) {
1339: anzi = ai[i + 1] - ai[i];
1340: acol = PetscSafePointerPlusOffset(aj, ai[i]);
1341: aval = PetscSafePointerPlusOffset(aa, ai[i]);
1342: cnzi = ci[i + 1] - ci[i];
1343: ccol = PetscSafePointerPlusOffset(cj, ci[i]);
1344: cval = ca + ci[i];
1345: for (j = 0; j < cnzi; j++) {
1346: brow = ccol[j];
1347: bnzj = bi[brow + 1] - bi[brow];
1348: bcol = bj + bi[brow];
1349: bval = ba + bi[brow];
1351: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1352: nexta = 0;
1353: nextb = 0;
1354: while (nexta < anzi && nextb < bnzj) {
1355: while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1356: if (nexta == anzi) break;
1357: while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1358: if (nextb == bnzj) break;
1359: if (acol[nexta] == bcol[nextb]) {
1360: cval[j] += aval[nexta] * bval[nextb];
1361: nexta++;
1362: nextb++;
1363: flops += 2;
1364: }
1365: }
1366: }
1367: }
1368: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1369: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1370: PetscCall(PetscLogFlops(flops));
1371: PetscFunctionReturn(PETSC_SUCCESS);
1372: }
1374: PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatTransMatMult(void **data)
1375: {
1376: MatProductCtx_MatTransMatMult *atb = *(MatProductCtx_MatTransMatMult **)data;
1378: PetscFunctionBegin;
1379: PetscCall(MatDestroy(&atb->At));
1380: if (atb->destroy) PetscCall((*atb->destroy)(&atb->data));
1381: PetscCall(PetscFree(atb));
1382: PetscFunctionReturn(PETSC_SUCCESS);
1383: }
1385: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1386: {
1387: Mat At = NULL;
1388: Mat_Product *product = C->product;
1389: PetscBool flg, def, square;
1391: PetscFunctionBegin;
1392: MatCheckProduct(C, 4);
1393: square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1394: /* outerproduct */
1395: PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1396: if (flg) {
1397: /* create symbolic At */
1398: if (!square) {
1399: PetscCall(MatTransposeSymbolic(A, &At));
1400: PetscCall(MatSetBlockSizes(At, A->cmap->bs, B->cmap->bs));
1401: PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1402: }
1403: /* get symbolic C=At*B */
1404: PetscCall(MatProductSetAlgorithm(C, "sorted"));
1405: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1407: /* clean up */
1408: if (!square) PetscCall(MatDestroy(&At));
1410: C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1411: PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1412: PetscFunctionReturn(PETSC_SUCCESS);
1413: }
1415: /* matmatmult */
1416: PetscCall(PetscStrcmp(product->alg, "default", &def));
1417: PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1418: if (flg || def) {
1419: MatProductCtx_MatTransMatMult *atb;
1421: PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1422: PetscCall(PetscNew(&atb));
1423: if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1424: PetscCall(MatProductSetAlgorithm(C, "sorted"));
1425: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1426: PetscCall(MatProductSetAlgorithm(C, "at*b"));
1427: product->data = atb;
1428: product->destroy = MatProductCtxDestroy_SeqAIJ_MatTransMatMult;
1429: atb->At = At;
1431: C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1432: PetscFunctionReturn(PETSC_SUCCESS);
1433: }
1435: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1436: }
1438: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1439: {
1440: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1441: PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1442: PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1443: PetscLogDouble flops = 0.0;
1444: MatScalar *aa = a->a, *ba, *ca, *caj;
1446: PetscFunctionBegin;
1447: if (!c->a) {
1448: PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1450: c->a = ca;
1451: c->free_a = PETSC_TRUE;
1452: } else {
1453: ca = c->a;
1454: PetscCall(PetscArrayzero(ca, ci[cm]));
1455: }
1457: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1458: for (i = 0; i < am; i++) {
1459: bj = b->j + bi[i];
1460: ba = b->a + bi[i];
1461: bnzi = bi[i + 1] - bi[i];
1462: anzi = ai[i + 1] - ai[i];
1463: for (j = 0; j < anzi; j++) {
1464: nextb = 0;
1465: crow = *aj++;
1466: cjj = cj + ci[crow];
1467: caj = ca + ci[crow];
1468: /* perform sparse axpy operation. Note cjj includes bj. */
1469: for (k = 0; nextb < bnzi; k++) {
1470: if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1471: caj[k] += (*aa) * (*(ba + nextb));
1472: nextb++;
1473: }
1474: }
1475: flops += 2 * bnzi;
1476: aa++;
1477: }
1478: }
1480: /* Assemble the final matrix and clean up */
1481: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1482: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1483: PetscCall(PetscLogFlops(flops));
1484: PetscFunctionReturn(PETSC_SUCCESS);
1485: }
1487: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1488: {
1489: PetscFunctionBegin;
1490: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1491: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1492: PetscFunctionReturn(PETSC_SUCCESS);
1493: }
1495: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1496: {
1497: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1498: PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1499: const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1500: const PetscInt *aj;
1501: PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1502: PetscInt clda;
1503: PetscInt am4, bm4, col, i, j, n;
1505: PetscFunctionBegin;
1506: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1507: PetscCall(MatSeqAIJGetArrayRead(A, &av));
1508: if (add) {
1509: PetscCall(MatDenseGetArray(C, &c));
1510: } else {
1511: PetscCall(MatDenseGetArrayWrite(C, &c));
1512: }
1513: PetscCall(MatDenseGetArrayRead(B, &b));
1514: PetscCall(MatDenseGetLDA(B, &bm));
1515: PetscCall(MatDenseGetLDA(C, &clda));
1516: am4 = 4 * clda;
1517: bm4 = 4 * bm;
1518: if (b) {
1519: b1 = b;
1520: b2 = b1 + bm;
1521: b3 = b2 + bm;
1522: b4 = b3 + bm;
1523: } else b1 = b2 = b3 = b4 = NULL;
1524: c1 = c;
1525: c2 = c1 + clda;
1526: c3 = c2 + clda;
1527: c4 = c3 + clda;
1528: for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1529: for (i = 0; i < am; i++) { /* over rows of A in those columns */
1530: r1 = r2 = r3 = r4 = 0.0;
1531: n = a->i[i + 1] - a->i[i];
1532: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1533: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1534: for (j = 0; j < n; j++) {
1535: const PetscScalar aatmp = aa[j];
1536: const PetscInt ajtmp = aj[j];
1537: r1 += aatmp * b1[ajtmp];
1538: r2 += aatmp * b2[ajtmp];
1539: r3 += aatmp * b3[ajtmp];
1540: r4 += aatmp * b4[ajtmp];
1541: }
1542: if (add) {
1543: c1[i] += r1;
1544: c2[i] += r2;
1545: c3[i] += r3;
1546: c4[i] += r4;
1547: } else {
1548: c1[i] = r1;
1549: c2[i] = r2;
1550: c3[i] = r3;
1551: c4[i] = r4;
1552: }
1553: }
1554: if (b) {
1555: b1 += bm4;
1556: b2 += bm4;
1557: b3 += bm4;
1558: b4 += bm4;
1559: }
1560: c1 += am4;
1561: c2 += am4;
1562: c3 += am4;
1563: c4 += am4;
1564: }
1565: /* process remaining columns */
1566: if (col != cn) {
1567: PetscInt rc = cn - col;
1569: if (rc == 1) {
1570: for (i = 0; i < am; i++) {
1571: r1 = 0.0;
1572: n = a->i[i + 1] - a->i[i];
1573: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1574: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1575: for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1576: if (add) c1[i] += r1;
1577: else c1[i] = r1;
1578: }
1579: } else if (rc == 2) {
1580: for (i = 0; i < am; i++) {
1581: r1 = r2 = 0.0;
1582: n = a->i[i + 1] - a->i[i];
1583: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1584: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1585: for (j = 0; j < n; j++) {
1586: const PetscScalar aatmp = aa[j];
1587: const PetscInt ajtmp = aj[j];
1588: r1 += aatmp * b1[ajtmp];
1589: r2 += aatmp * b2[ajtmp];
1590: }
1591: if (add) {
1592: c1[i] += r1;
1593: c2[i] += r2;
1594: } else {
1595: c1[i] = r1;
1596: c2[i] = r2;
1597: }
1598: }
1599: } else {
1600: for (i = 0; i < am; i++) {
1601: r1 = r2 = r3 = 0.0;
1602: n = a->i[i + 1] - a->i[i];
1603: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1604: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1605: for (j = 0; j < n; j++) {
1606: const PetscScalar aatmp = aa[j];
1607: const PetscInt ajtmp = aj[j];
1608: r1 += aatmp * b1[ajtmp];
1609: r2 += aatmp * b2[ajtmp];
1610: r3 += aatmp * b3[ajtmp];
1611: }
1612: if (add) {
1613: c1[i] += r1;
1614: c2[i] += r2;
1615: c3[i] += r3;
1616: } else {
1617: c1[i] = r1;
1618: c2[i] = r2;
1619: c3[i] = r3;
1620: }
1621: }
1622: }
1623: }
1624: PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1625: if (add) {
1626: PetscCall(MatDenseRestoreArray(C, &c));
1627: } else {
1628: PetscCall(MatDenseRestoreArrayWrite(C, &c));
1629: }
1630: PetscCall(MatDenseRestoreArrayRead(B, &b));
1631: PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1632: PetscFunctionReturn(PETSC_SUCCESS);
1633: }
1635: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1636: {
1637: PetscFunctionBegin;
1638: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1639: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1640: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1642: PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1643: PetscFunctionReturn(PETSC_SUCCESS);
1644: }
1646: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1647: {
1648: PetscFunctionBegin;
1649: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1650: C->ops->productsymbolic = MatProductSymbolic_AB;
1651: PetscFunctionReturn(PETSC_SUCCESS);
1652: }
1654: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
1656: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1657: {
1658: PetscFunctionBegin;
1659: C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1660: C->ops->productsymbolic = MatProductSymbolic_AtB;
1661: PetscFunctionReturn(PETSC_SUCCESS);
1662: }
1664: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1665: {
1666: PetscFunctionBegin;
1667: C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1668: C->ops->productsymbolic = MatProductSymbolic_ABt;
1669: PetscFunctionReturn(PETSC_SUCCESS);
1670: }
1672: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1673: {
1674: Mat_Product *product = C->product;
1676: PetscFunctionBegin;
1677: switch (product->type) {
1678: case MATPRODUCT_AB:
1679: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1680: break;
1681: case MATPRODUCT_AtB:
1682: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1683: break;
1684: case MATPRODUCT_ABt:
1685: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1686: break;
1687: default:
1688: break;
1689: }
1690: PetscFunctionReturn(PETSC_SUCCESS);
1691: }
1693: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1694: {
1695: Mat_Product *product = C->product;
1696: Mat A = product->A;
1697: PetscBool baij;
1699: PetscFunctionBegin;
1700: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1701: if (!baij) { /* A is seqsbaij */
1702: PetscBool sbaij;
1703: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1704: PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
1706: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1707: } else { /* A is seqbaij */
1708: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1709: }
1711: C->ops->productsymbolic = MatProductSymbolic_AB;
1712: PetscFunctionReturn(PETSC_SUCCESS);
1713: }
1715: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1716: {
1717: Mat_Product *product = C->product;
1719: PetscFunctionBegin;
1720: MatCheckProduct(C, 1);
1721: PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1722: if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1723: else if (product->type == MATPRODUCT_AtB) {
1724: PetscBool flg;
1726: PetscCall(PetscObjectTypeCompare((PetscObject)product->A, MATSEQBAIJ, &flg));
1727: if (flg) {
1728: C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense;
1729: C->ops->productsymbolic = MatProductSymbolic_AtB;
1730: }
1731: }
1732: PetscFunctionReturn(PETSC_SUCCESS);
1733: }
1735: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1736: {
1737: PetscFunctionBegin;
1738: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1739: C->ops->productsymbolic = MatProductSymbolic_AB;
1740: PetscFunctionReturn(PETSC_SUCCESS);
1741: }
1743: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1744: {
1745: Mat_Product *product = C->product;
1747: PetscFunctionBegin;
1748: if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1749: PetscFunctionReturn(PETSC_SUCCESS);
1750: }
1752: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1753: {
1754: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1755: Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1756: PetscInt *bi = b->i, *bj = b->j;
1757: PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1758: MatScalar *btval, *btval_den, *ba = b->a;
1759: PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1761: PetscFunctionBegin;
1762: btval_den = btdense->v;
1763: PetscCall(PetscArrayzero(btval_den, m * n));
1764: for (k = 0; k < ncolors; k++) {
1765: ncolumns = coloring->ncolumns[k];
1766: for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1767: col = *(columns + colorforcol[k] + l);
1768: btcol = bj + bi[col];
1769: btval = ba + bi[col];
1770: anz = bi[col + 1] - bi[col];
1771: for (j = 0; j < anz; j++) {
1772: brow = btcol[j];
1773: btval_den[brow] = btval[j];
1774: }
1775: }
1776: btval_den += m;
1777: }
1778: PetscFunctionReturn(PETSC_SUCCESS);
1779: }
1781: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1782: {
1783: Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data;
1784: const PetscScalar *ca_den, *ca_den_ptr;
1785: PetscScalar *ca = csp->a;
1786: PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1787: PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1788: PetscInt nrows, *row, *idx;
1789: PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
1791: PetscFunctionBegin;
1792: PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1794: if (brows > 0) {
1795: PetscInt *lstart, row_end, row_start;
1796: lstart = matcoloring->lstart;
1797: PetscCall(PetscArrayzero(lstart, ncolors));
1799: row_end = brows;
1800: if (row_end > m) row_end = m;
1801: for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1802: ca_den_ptr = ca_den;
1803: for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1804: nrows = matcoloring->nrows[k];
1805: row = rows + colorforrow[k];
1806: idx = den2sp + colorforrow[k];
1807: for (l = lstart[k]; l < nrows; l++) {
1808: if (row[l] >= row_end) {
1809: lstart[k] = l;
1810: break;
1811: } else {
1812: ca[idx[l]] = ca_den_ptr[row[l]];
1813: }
1814: }
1815: ca_den_ptr += m;
1816: }
1817: row_end += brows;
1818: if (row_end > m) row_end = m;
1819: }
1820: } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1821: ca_den_ptr = ca_den;
1822: for (k = 0; k < ncolors; k++) {
1823: nrows = matcoloring->nrows[k];
1824: row = rows + colorforrow[k];
1825: idx = den2sp + colorforrow[k];
1826: for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1827: ca_den_ptr += m;
1828: }
1829: }
1831: PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1832: #if defined(PETSC_USE_INFO)
1833: if (matcoloring->brows > 0) {
1834: PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1835: } else {
1836: PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1837: }
1838: #endif
1839: PetscFunctionReturn(PETSC_SUCCESS);
1840: }
1842: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1843: {
1844: PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1845: const PetscInt *is, *ci, *cj, *row_idx;
1846: PetscInt nis = iscoloring->n, *rowhit, bs = 1;
1847: IS *isa;
1848: Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data;
1849: PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1850: PetscInt *colorforcol, *columns, *columns_i, brows;
1851: PetscBool flg;
1853: PetscFunctionBegin;
1854: PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1856: /* bs > 1 is not being tested yet! */
1857: Nbs = mat->cmap->N / bs;
1858: c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */
1859: c->N = Nbs;
1860: c->m = c->M;
1861: c->rstart = 0;
1862: c->brows = 100;
1864: c->ncolors = nis;
1865: PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1866: PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1867: PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1869: brows = c->brows;
1870: PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1871: if (flg) c->brows = brows;
1872: if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
1874: colorforrow[0] = 0;
1875: rows_i = rows;
1876: den2sp_i = den2sp;
1878: PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1879: PetscCall(PetscMalloc1(Nbs + 1, &columns));
1881: colorforcol[0] = 0;
1882: columns_i = columns;
1884: /* get column-wise storage of mat */
1885: PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1887: cm = c->m;
1888: PetscCall(PetscMalloc1(cm + 1, &rowhit));
1889: PetscCall(PetscMalloc1(cm + 1, &idxhit));
1890: for (i = 0; i < nis; i++) { /* loop over color */
1891: PetscCall(ISGetLocalSize(isa[i], &n));
1892: PetscCall(ISGetIndices(isa[i], &is));
1894: c->ncolumns[i] = n;
1895: if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1896: colorforcol[i + 1] = colorforcol[i] + n;
1897: columns_i += n;
1899: /* fast, crude version requires O(N*N) work */
1900: PetscCall(PetscArrayzero(rowhit, cm));
1902: for (j = 0; j < n; j++) { /* loop over columns*/
1903: col = is[j];
1904: row_idx = cj + ci[col];
1905: m = ci[col + 1] - ci[col];
1906: for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1907: idxhit[*row_idx] = spidx[ci[col] + k];
1908: rowhit[*row_idx++] = col + 1;
1909: }
1910: }
1911: /* count the number of hits */
1912: nrows = 0;
1913: for (j = 0; j < cm; j++) {
1914: if (rowhit[j]) nrows++;
1915: }
1916: c->nrows[i] = nrows;
1917: colorforrow[i + 1] = colorforrow[i] + nrows;
1919: nrows = 0;
1920: for (j = 0; j < cm; j++) { /* loop over rows */
1921: if (rowhit[j]) {
1922: rows_i[nrows] = j;
1923: den2sp_i[nrows] = idxhit[j];
1924: nrows++;
1925: }
1926: }
1927: den2sp_i += nrows;
1929: PetscCall(ISRestoreIndices(isa[i], &is));
1930: rows_i += nrows;
1931: }
1932: PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1933: PetscCall(PetscFree(rowhit));
1934: PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1935: PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1937: c->colorforrow = colorforrow;
1938: c->rows = rows;
1939: c->den2sp = den2sp;
1940: c->colorforcol = colorforcol;
1941: c->columns = columns;
1943: PetscCall(PetscFree(idxhit));
1944: PetscFunctionReturn(PETSC_SUCCESS);
1945: }
1947: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1948: {
1949: Mat_Product *product = C->product;
1950: Mat A = product->A, B = product->B;
1952: PetscFunctionBegin;
1953: if (C->ops->mattransposemultnumeric) {
1954: /* Alg: "outerproduct" */
1955: PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1956: } else {
1957: /* Alg: "matmatmult" -- C = At*B */
1958: MatProductCtx_MatTransMatMult *atb = (MatProductCtx_MatTransMatMult *)product->data;
1960: PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1961: if (atb->At) {
1962: /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1963: user may have called MatProductReplaceMats() to get this A=product->A */
1964: PetscCall(MatTransposeSetPrecursor(A, atb->At));
1965: PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1966: }
1967: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1968: }
1969: PetscFunctionReturn(PETSC_SUCCESS);
1970: }
1972: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1973: {
1974: Mat_Product *product = C->product;
1975: Mat A = product->A, B = product->B;
1976: PetscReal fill = product->fill;
1978: PetscFunctionBegin;
1979: PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
1981: C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1982: PetscFunctionReturn(PETSC_SUCCESS);
1983: }
1985: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1986: {
1987: Mat_Product *product = C->product;
1988: PetscInt alg = 0; /* default algorithm */
1989: PetscBool flg = PETSC_FALSE;
1990: #if !defined(PETSC_HAVE_HYPRE)
1991: const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1992: PetscInt nalg = 7;
1993: #else
1994: const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1995: PetscInt nalg = 8;
1996: #endif
1998: PetscFunctionBegin;
1999: /* Set default algorithm */
2000: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2001: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2003: /* Get runtime option */
2004: if (product->api_user) {
2005: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
2006: PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2007: PetscOptionsEnd();
2008: } else {
2009: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2010: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2011: PetscOptionsEnd();
2012: }
2013: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2015: C->ops->productsymbolic = MatProductSymbolic_AB;
2016: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2017: PetscFunctionReturn(PETSC_SUCCESS);
2018: }
2020: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2021: {
2022: Mat_Product *product = C->product;
2023: PetscInt alg = 0; /* default algorithm */
2024: PetscBool flg = PETSC_FALSE;
2025: const char *algTypes[3] = {"default", "at*b", "outerproduct"};
2026: PetscInt nalg = 3;
2028: PetscFunctionBegin;
2029: /* Get runtime option */
2030: if (product->api_user) {
2031: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2032: PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2033: PetscOptionsEnd();
2034: } else {
2035: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2036: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2037: PetscOptionsEnd();
2038: }
2039: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2041: C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2042: PetscFunctionReturn(PETSC_SUCCESS);
2043: }
2045: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2046: {
2047: Mat_Product *product = C->product;
2048: PetscInt alg = 0; /* default algorithm */
2049: PetscBool flg = PETSC_FALSE;
2050: const char *algTypes[2] = {"default", "color"};
2051: PetscInt nalg = 2;
2053: PetscFunctionBegin;
2054: /* Set default algorithm */
2055: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2056: if (!flg) {
2057: alg = 1;
2058: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2059: }
2061: /* Get runtime option */
2062: if (product->api_user) {
2063: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2064: PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2065: PetscOptionsEnd();
2066: } else {
2067: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2068: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2069: PetscOptionsEnd();
2070: }
2071: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2073: C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2074: C->ops->productsymbolic = MatProductSymbolic_ABt;
2075: PetscFunctionReturn(PETSC_SUCCESS);
2076: }
2078: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2079: {
2080: Mat_Product *product = C->product;
2081: PetscBool flg = PETSC_FALSE;
2082: PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */
2083: #if !defined(PETSC_HAVE_HYPRE)
2084: const char *algTypes[2] = {"scalable", "rap"};
2085: PetscInt nalg = 2;
2086: #else
2087: const char *algTypes[3] = {"scalable", "rap", "hypre"};
2088: PetscInt nalg = 3;
2089: #endif
2091: PetscFunctionBegin;
2092: /* Set default algorithm */
2093: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2094: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2096: /* Get runtime option */
2097: if (product->api_user) {
2098: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2099: PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2100: PetscOptionsEnd();
2101: } else {
2102: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2103: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2104: PetscOptionsEnd();
2105: }
2106: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2108: C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2109: PetscFunctionReturn(PETSC_SUCCESS);
2110: }
2112: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2113: {
2114: Mat_Product *product = C->product;
2115: PetscBool flg = PETSC_FALSE;
2116: PetscInt alg = 0; /* default algorithm */
2117: const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2118: PetscInt nalg = 3;
2120: PetscFunctionBegin;
2121: /* Set default algorithm */
2122: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2123: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2125: /* Get runtime option */
2126: if (product->api_user) {
2127: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2128: PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2129: PetscOptionsEnd();
2130: } else {
2131: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2132: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2133: PetscOptionsEnd();
2134: }
2135: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2137: C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2138: PetscFunctionReturn(PETSC_SUCCESS);
2139: }
2141: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2142: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2143: {
2144: Mat_Product *product = C->product;
2145: PetscInt alg = 0; /* default algorithm */
2146: PetscBool flg = PETSC_FALSE;
2147: const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2148: PetscInt nalg = 7;
2150: PetscFunctionBegin;
2151: /* Set default algorithm */
2152: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2153: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2155: /* Get runtime option */
2156: if (product->api_user) {
2157: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2158: PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2159: PetscOptionsEnd();
2160: } else {
2161: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2162: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2163: PetscOptionsEnd();
2164: }
2165: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2167: C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2168: C->ops->productsymbolic = MatProductSymbolic_ABC;
2169: PetscFunctionReturn(PETSC_SUCCESS);
2170: }
2172: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2173: {
2174: Mat_Product *product = C->product;
2176: PetscFunctionBegin;
2177: switch (product->type) {
2178: case MATPRODUCT_AB:
2179: PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2180: break;
2181: case MATPRODUCT_AtB:
2182: PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2183: break;
2184: case MATPRODUCT_ABt:
2185: PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2186: break;
2187: case MATPRODUCT_PtAP:
2188: PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2189: break;
2190: case MATPRODUCT_RARt:
2191: PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2192: break;
2193: case MATPRODUCT_ABC:
2194: PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2195: break;
2196: default:
2197: break;
2198: }
2199: PetscFunctionReturn(PETSC_SUCCESS);
2200: }