Actual source code: matmatmult.c
1: /*
2: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3: C = A * B
4: */
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/utils/freespace.h>
8: #include <petscbt.h>
9: #include <petsc/private/isimpl.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
12: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13: {
14: PetscFunctionBegin;
15: if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16: else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
17: PetscFunctionReturn(PETSC_SUCCESS);
18: }
20: /* Modified from MatCreateSeqAIJWithArrays() */
21: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22: {
23: PetscInt ii;
24: Mat_SeqAIJ *aij;
25: PetscBool isseqaij, ofree_a, ofree_ij;
27: PetscFunctionBegin;
28: PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
29: PetscCall(MatSetSizes(mat, m, n, m, n));
31: if (!mtype) {
32: PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
33: if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34: } else {
35: PetscCall(MatSetType(mat, mtype));
36: }
38: aij = (Mat_SeqAIJ *)mat->data;
39: ofree_a = aij->free_a;
40: ofree_ij = aij->free_ij;
41: /* changes the free flags */
42: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
44: PetscCall(PetscFree(aij->ilen));
45: PetscCall(PetscFree(aij->imax));
46: PetscCall(PetscMalloc1(m, &aij->imax));
47: PetscCall(PetscMalloc1(m, &aij->ilen));
48: for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49: const PetscInt rnz = i[ii + 1] - i[ii];
50: aij->nonzerorowcnt += !!rnz;
51: aij->rmax = PetscMax(aij->rmax, rnz);
52: aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53: }
54: aij->maxnz = i[m];
55: aij->nz = i[m];
57: if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
58: if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
59: if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
61: aij->i = i;
62: aij->j = j;
63: aij->a = a;
64: aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
65: aij->free_a = PETSC_FALSE;
66: aij->free_ij = PETSC_FALSE;
67: PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
68: // Always build the diag info when i, j are set
69: PetscCall(MatMarkDiagonal_SeqAIJ(mat));
70: PetscFunctionReturn(PETSC_SUCCESS);
71: }
73: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
74: {
75: Mat_Product *product = C->product;
76: MatProductAlgorithm alg;
77: PetscBool flg;
79: PetscFunctionBegin;
80: if (product) {
81: alg = product->alg;
82: } else {
83: alg = "sorted";
84: }
85: /* sorted */
86: PetscCall(PetscStrcmp(alg, "sorted", &flg));
87: if (flg) {
88: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
89: PetscFunctionReturn(PETSC_SUCCESS);
90: }
92: /* scalable */
93: PetscCall(PetscStrcmp(alg, "scalable", &flg));
94: if (flg) {
95: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
96: PetscFunctionReturn(PETSC_SUCCESS);
97: }
99: /* scalable_fast */
100: PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
101: if (flg) {
102: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
103: PetscFunctionReturn(PETSC_SUCCESS);
104: }
106: /* heap */
107: PetscCall(PetscStrcmp(alg, "heap", &flg));
108: if (flg) {
109: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
110: PetscFunctionReturn(PETSC_SUCCESS);
111: }
113: /* btheap */
114: PetscCall(PetscStrcmp(alg, "btheap", &flg));
115: if (flg) {
116: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
117: PetscFunctionReturn(PETSC_SUCCESS);
118: }
120: /* llcondensed */
121: PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
122: if (flg) {
123: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: /* rowmerge */
128: PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
129: if (flg) {
130: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
131: PetscFunctionReturn(PETSC_SUCCESS);
132: }
134: #if defined(PETSC_HAVE_HYPRE)
135: PetscCall(PetscStrcmp(alg, "hypre", &flg));
136: if (flg) {
137: PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
138: PetscFunctionReturn(PETSC_SUCCESS);
139: }
140: #endif
142: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
143: }
145: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
146: {
147: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
148: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
149: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
150: PetscReal afill;
151: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
152: PetscHMapI ta;
153: PetscBT lnkbt;
154: PetscFreeSpaceList free_space = NULL, current_space = NULL;
156: PetscFunctionBegin;
157: /* Get ci and cj */
158: /* Allocate ci array, arrays for fill computation and */
159: /* free space for accumulating nonzero column info */
160: PetscCall(PetscMalloc1(am + 2, &ci));
161: ci[0] = 0;
163: /* create and initialize a linked list */
164: PetscCall(PetscHMapICreateWithSize(bn, &ta));
165: MatRowMergeMax_SeqAIJ(b, bm, ta);
166: PetscCall(PetscHMapIGetSize(ta, &Crmax));
167: PetscCall(PetscHMapIDestroy(&ta));
169: PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
171: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
172: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
174: current_space = free_space;
176: /* Determine ci and cj */
177: for (i = 0; i < am; i++) {
178: anzi = ai[i + 1] - ai[i];
179: aj = a->j + ai[i];
180: for (j = 0; j < anzi; j++) {
181: brow = aj[j];
182: bnzj = bi[brow + 1] - bi[brow];
183: bj = b->j + bi[brow];
184: /* add non-zero cols of B into the sorted linked list lnk */
185: PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
186: }
187: /* add possible missing diagonal entry */
188: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
189: cnzi = lnk[0];
191: /* If free space is not available, make more free space */
192: /* Double the amount of total space in the list */
193: if (current_space->local_remaining < cnzi) {
194: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
195: ndouble++;
196: }
198: /* Copy data into free space, then initialize lnk */
199: PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
201: current_space->array += cnzi;
202: current_space->local_used += cnzi;
203: current_space->local_remaining -= cnzi;
205: ci[i + 1] = ci[i] + cnzi;
206: }
208: /* Column indices are in the list of free space */
209: /* Allocate space for cj, initialize cj, and */
210: /* destroy list of free space and other temporary array(s) */
211: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
212: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
213: PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
215: /* put together the new symbolic matrix */
216: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
217: PetscCall(MatSetBlockSizesFromMats(C, A, B));
219: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
220: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
221: c = (Mat_SeqAIJ *)C->data;
222: c->free_a = PETSC_FALSE;
223: c->free_ij = PETSC_TRUE;
224: c->nonew = 0;
226: /* fast, needs non-scalable O(bn) array 'abdense' */
227: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
229: /* set MatInfo */
230: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
231: if (afill < 1.0) afill = 1.0;
232: C->info.mallocs = ndouble;
233: C->info.fill_ratio_given = fill;
234: C->info.fill_ratio_needed = afill;
236: #if defined(PETSC_USE_INFO)
237: if (ci[am]) {
238: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
239: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
240: } else {
241: PetscCall(PetscInfo(C, "Empty matrix product\n"));
242: }
243: #endif
244: PetscFunctionReturn(PETSC_SUCCESS);
245: }
247: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
248: {
249: PetscLogDouble flops = 0.0;
250: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
251: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
252: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
253: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
254: PetscInt am = A->rmap->n, cm = C->rmap->n;
255: PetscInt i, j, k, anzi, bnzi, cnzi, brow;
256: PetscScalar *ca, valtmp;
257: PetscScalar *ab_dense;
258: PetscContainer cab_dense;
259: const PetscScalar *aa, *ba, *baj;
261: PetscFunctionBegin;
262: PetscCall(MatSeqAIJGetArrayRead(A, &aa));
263: PetscCall(MatSeqAIJGetArrayRead(B, &ba));
264: if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
265: PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
266: c->a = ca;
267: c->free_a = PETSC_TRUE;
268: } else ca = c->a;
270: /* TODO this should be done in the symbolic phase */
271: /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
272: that is hard to eradicate) */
273: PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
274: if (!cab_dense) {
275: PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
276: PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault));
277: } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
278: PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
280: /* clean old values in C */
281: PetscCall(PetscArrayzero(ca, ci[cm]));
282: /* Traverse A row-wise. */
283: /* Build the ith row in C by summing over nonzero columns in A, */
284: /* the rows of B corresponding to nonzeros of A. */
285: for (i = 0; i < am; i++) {
286: anzi = ai[i + 1] - ai[i];
287: for (j = 0; j < anzi; j++) {
288: brow = aj[j];
289: bnzi = bi[brow + 1] - bi[brow];
290: bjj = PetscSafePointerPlusOffset(bj, bi[brow]);
291: baj = PetscSafePointerPlusOffset(ba, bi[brow]);
292: /* perform dense axpy */
293: valtmp = aa[j];
294: for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
295: flops += 2 * bnzi;
296: }
297: aj = PetscSafePointerPlusOffset(aj, anzi);
298: aa = PetscSafePointerPlusOffset(aa, anzi);
300: cnzi = ci[i + 1] - ci[i];
301: for (k = 0; k < cnzi; k++) {
302: ca[k] += ab_dense[cj[k]];
303: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304: }
305: flops += cnzi;
306: cj = PetscSafePointerPlusOffset(cj, cnzi);
307: ca += cnzi;
308: }
309: #if defined(PETSC_HAVE_DEVICE)
310: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
311: #endif
312: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
313: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
314: PetscCall(PetscLogFlops(flops));
315: PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
316: PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
317: PetscFunctionReturn(PETSC_SUCCESS);
318: }
320: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
321: {
322: PetscLogDouble flops = 0.0;
323: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
324: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
325: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
326: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
327: PetscInt am = A->rmap->N, cm = C->rmap->N;
328: PetscInt i, j, k, anzi, bnzi, cnzi, brow;
329: PetscScalar *ca = c->a, valtmp;
330: const PetscScalar *aa, *ba, *baj;
331: PetscInt nextb;
333: PetscFunctionBegin;
334: PetscCall(MatSeqAIJGetArrayRead(A, &aa));
335: PetscCall(MatSeqAIJGetArrayRead(B, &ba));
336: if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
337: PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
338: c->a = ca;
339: c->free_a = PETSC_TRUE;
340: }
342: /* clean old values in C */
343: PetscCall(PetscArrayzero(ca, ci[cm]));
344: /* Traverse A row-wise. */
345: /* Build the ith row in C by summing over nonzero columns in A, */
346: /* the rows of B corresponding to nonzeros of A. */
347: for (i = 0; i < am; i++) {
348: anzi = ai[i + 1] - ai[i];
349: cnzi = ci[i + 1] - ci[i];
350: for (j = 0; j < anzi; j++) {
351: brow = aj[j];
352: bnzi = bi[brow + 1] - bi[brow];
353: bjj = bj + bi[brow];
354: baj = ba + bi[brow];
355: /* perform sparse axpy */
356: valtmp = aa[j];
357: nextb = 0;
358: for (k = 0; nextb < bnzi; k++) {
359: if (cj[k] == bjj[nextb]) { /* ccol == bcol */
360: ca[k] += valtmp * baj[nextb++];
361: }
362: }
363: flops += 2 * bnzi;
364: }
365: aj += anzi;
366: aa += anzi;
367: cj += cnzi;
368: ca += cnzi;
369: }
370: #if defined(PETSC_HAVE_DEVICE)
371: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
372: #endif
373: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
374: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
375: PetscCall(PetscLogFlops(flops));
376: PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
377: PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
378: PetscFunctionReturn(PETSC_SUCCESS);
379: }
381: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
382: {
383: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
384: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
385: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
386: MatScalar *ca;
387: PetscReal afill;
388: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
389: PetscHMapI ta;
390: PetscFreeSpaceList free_space = NULL, current_space = NULL;
392: PetscFunctionBegin;
393: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
394: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
395: PetscCall(PetscMalloc1(am + 2, &ci));
396: ci[0] = 0;
398: /* create and initialize a linked list */
399: PetscCall(PetscHMapICreateWithSize(bn, &ta));
400: MatRowMergeMax_SeqAIJ(b, bm, ta);
401: PetscCall(PetscHMapIGetSize(ta, &Crmax));
402: PetscCall(PetscHMapIDestroy(&ta));
404: PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
406: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
407: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
408: current_space = free_space;
410: /* Determine ci and cj */
411: for (i = 0; i < am; i++) {
412: anzi = ai[i + 1] - ai[i];
413: aj = a->j + ai[i];
414: for (j = 0; j < anzi; j++) {
415: brow = aj[j];
416: bnzj = bi[brow + 1] - bi[brow];
417: bj = b->j + bi[brow];
418: /* add non-zero cols of B into the sorted linked list lnk */
419: PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
420: }
421: /* add possible missing diagonal entry */
422: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
423: cnzi = lnk[1];
425: /* If free space is not available, make more free space */
426: /* Double the amount of total space in the list */
427: if (current_space->local_remaining < cnzi) {
428: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
429: ndouble++;
430: }
432: /* Copy data into free space, then initialize lnk */
433: PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
435: current_space->array += cnzi;
436: current_space->local_used += cnzi;
437: current_space->local_remaining -= cnzi;
439: ci[i + 1] = ci[i] + cnzi;
440: }
442: /* Column indices are in the list of free space */
443: /* Allocate space for cj, initialize cj, and */
444: /* destroy list of free space and other temporary array(s) */
445: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
446: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
447: PetscCall(PetscLLCondensedDestroy_fast(lnk));
449: /* Allocate space for ca */
450: PetscCall(PetscCalloc1(ci[am] + 1, &ca));
452: /* put together the new symbolic matrix */
453: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
454: PetscCall(MatSetBlockSizesFromMats(C, A, B));
456: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
457: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
458: c = (Mat_SeqAIJ *)C->data;
459: c->free_a = PETSC_TRUE;
460: c->free_ij = PETSC_TRUE;
461: c->nonew = 0;
463: /* slower, less memory */
464: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
466: /* set MatInfo */
467: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
468: if (afill < 1.0) afill = 1.0;
469: C->info.mallocs = ndouble;
470: C->info.fill_ratio_given = fill;
471: C->info.fill_ratio_needed = afill;
473: #if defined(PETSC_USE_INFO)
474: if (ci[am]) {
475: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
476: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
477: } else {
478: PetscCall(PetscInfo(C, "Empty matrix product\n"));
479: }
480: #endif
481: PetscFunctionReturn(PETSC_SUCCESS);
482: }
484: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
485: {
486: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
487: PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
488: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
489: MatScalar *ca;
490: PetscReal afill;
491: PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
492: PetscHMapI ta;
493: PetscFreeSpaceList free_space = NULL, current_space = NULL;
495: PetscFunctionBegin;
496: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
497: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
498: PetscCall(PetscMalloc1(am + 2, &ci));
499: ci[0] = 0;
501: /* create and initialize a linked list */
502: PetscCall(PetscHMapICreateWithSize(bn, &ta));
503: MatRowMergeMax_SeqAIJ(b, bm, ta);
504: PetscCall(PetscHMapIGetSize(ta, &Crmax));
505: PetscCall(PetscHMapIDestroy(&ta));
506: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
508: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
509: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
510: current_space = free_space;
512: /* Determine ci and cj */
513: for (i = 0; i < am; i++) {
514: anzi = ai[i + 1] - ai[i];
515: aj = a->j + ai[i];
516: for (j = 0; j < anzi; j++) {
517: brow = aj[j];
518: bnzj = bi[brow + 1] - bi[brow];
519: bj = b->j + bi[brow];
520: /* add non-zero cols of B into the sorted linked list lnk */
521: PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
522: }
523: /* add possible missing diagonal entry */
524: if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
526: cnzi = lnk[0];
528: /* If free space is not available, make more free space */
529: /* Double the amount of total space in the list */
530: if (current_space->local_remaining < cnzi) {
531: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
532: ndouble++;
533: }
535: /* Copy data into free space, then initialize lnk */
536: PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
538: current_space->array += cnzi;
539: current_space->local_used += cnzi;
540: current_space->local_remaining -= cnzi;
542: ci[i + 1] = ci[i] + cnzi;
543: }
545: /* Column indices are in the list of free space */
546: /* Allocate space for cj, initialize cj, and */
547: /* destroy list of free space and other temporary array(s) */
548: PetscCall(PetscMalloc1(ci[am] + 1, &cj));
549: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
550: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
552: /* Allocate space for ca */
553: PetscCall(PetscCalloc1(ci[am] + 1, &ca));
555: /* put together the new symbolic matrix */
556: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
557: PetscCall(MatSetBlockSizesFromMats(C, A, B));
559: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
560: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
561: c = (Mat_SeqAIJ *)C->data;
562: c->free_a = PETSC_TRUE;
563: c->free_ij = PETSC_TRUE;
564: c->nonew = 0;
566: /* slower, less memory */
567: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
569: /* set MatInfo */
570: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
571: if (afill < 1.0) afill = 1.0;
572: C->info.mallocs = ndouble;
573: C->info.fill_ratio_given = fill;
574: C->info.fill_ratio_needed = afill;
576: #if defined(PETSC_USE_INFO)
577: if (ci[am]) {
578: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
579: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
580: } else {
581: PetscCall(PetscInfo(C, "Empty matrix product\n"));
582: }
583: #endif
584: PetscFunctionReturn(PETSC_SUCCESS);
585: }
587: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
588: {
589: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
590: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
591: PetscInt *ci, *cj, *bb;
592: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
593: PetscReal afill;
594: PetscInt i, j, col, ndouble = 0;
595: PetscFreeSpaceList free_space = NULL, current_space = NULL;
596: PetscHeap h;
598: PetscFunctionBegin;
599: /* Get ci and cj - by merging sorted rows using a heap */
600: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
601: PetscCall(PetscMalloc1(am + 2, &ci));
602: ci[0] = 0;
604: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
605: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
606: current_space = free_space;
608: PetscCall(PetscHeapCreate(a->rmax, &h));
609: PetscCall(PetscMalloc1(a->rmax, &bb));
611: /* Determine ci and cj */
612: for (i = 0; i < am; i++) {
613: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
614: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
615: ci[i + 1] = ci[i];
616: /* Populate the min heap */
617: for (j = 0; j < anzi; j++) {
618: bb[j] = bi[acol[j]]; /* bb points at the start of the row */
619: if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
620: PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
621: }
622: }
623: /* Pick off the min element, adding it to free space */
624: PetscCall(PetscHeapPop(h, &j, &col));
625: while (j >= 0) {
626: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
627: PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
628: ndouble++;
629: }
630: *(current_space->array++) = col;
631: current_space->local_used++;
632: current_space->local_remaining--;
633: ci[i + 1]++;
635: /* stash if anything else remains in this row of B */
636: if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
637: while (1) { /* pop and stash any other rows of B that also had an entry in this column */
638: PetscInt j2, col2;
639: PetscCall(PetscHeapPeek(h, &j2, &col2));
640: if (col2 != col) break;
641: PetscCall(PetscHeapPop(h, &j2, &col2));
642: if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
643: }
644: /* Put any stashed elements back into the min heap */
645: PetscCall(PetscHeapUnstash(h));
646: PetscCall(PetscHeapPop(h, &j, &col));
647: }
648: }
649: PetscCall(PetscFree(bb));
650: PetscCall(PetscHeapDestroy(&h));
652: /* Column indices are in the list of free space */
653: /* Allocate space for cj, initialize cj, and */
654: /* destroy list of free space and other temporary array(s) */
655: PetscCall(PetscMalloc1(ci[am], &cj));
656: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
658: /* put together the new symbolic matrix */
659: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
660: PetscCall(MatSetBlockSizesFromMats(C, A, B));
662: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
663: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
664: c = (Mat_SeqAIJ *)C->data;
665: c->free_a = PETSC_TRUE;
666: c->free_ij = PETSC_TRUE;
667: c->nonew = 0;
669: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
671: /* set MatInfo */
672: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
673: if (afill < 1.0) afill = 1.0;
674: C->info.mallocs = ndouble;
675: C->info.fill_ratio_given = fill;
676: C->info.fill_ratio_needed = afill;
678: #if defined(PETSC_USE_INFO)
679: if (ci[am]) {
680: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
681: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
682: } else {
683: PetscCall(PetscInfo(C, "Empty matrix product\n"));
684: }
685: #endif
686: PetscFunctionReturn(PETSC_SUCCESS);
687: }
689: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
690: {
691: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
692: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
693: PetscInt *ci, *cj, *bb;
694: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
695: PetscReal afill;
696: PetscInt i, j, col, ndouble = 0;
697: PetscFreeSpaceList free_space = NULL, current_space = NULL;
698: PetscHeap h;
699: PetscBT bt;
701: PetscFunctionBegin;
702: /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
703: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
704: PetscCall(PetscMalloc1(am + 2, &ci));
705: ci[0] = 0;
707: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
708: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
710: current_space = free_space;
712: PetscCall(PetscHeapCreate(a->rmax, &h));
713: PetscCall(PetscMalloc1(a->rmax, &bb));
714: PetscCall(PetscBTCreate(bn, &bt));
716: /* Determine ci and cj */
717: for (i = 0; i < am; i++) {
718: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
719: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
720: const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
721: ci[i + 1] = ci[i];
722: /* Populate the min heap */
723: for (j = 0; j < anzi; j++) {
724: PetscInt brow = acol[j];
725: for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
726: PetscInt bcol = bj[bb[j]];
727: if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
728: PetscCall(PetscHeapAdd(h, j, bcol));
729: bb[j]++;
730: break;
731: }
732: }
733: }
734: /* Pick off the min element, adding it to free space */
735: PetscCall(PetscHeapPop(h, &j, &col));
736: while (j >= 0) {
737: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
738: fptr = NULL; /* need PetscBTMemzero */
739: PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
740: ndouble++;
741: }
742: *(current_space->array++) = col;
743: current_space->local_used++;
744: current_space->local_remaining--;
745: ci[i + 1]++;
747: /* stash if anything else remains in this row of B */
748: for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
749: PetscInt bcol = bj[bb[j]];
750: if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
751: PetscCall(PetscHeapAdd(h, j, bcol));
752: bb[j]++;
753: break;
754: }
755: }
756: PetscCall(PetscHeapPop(h, &j, &col));
757: }
758: if (fptr) { /* Clear the bits for this row */
759: for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
760: } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
761: PetscCall(PetscBTMemzero(bn, bt));
762: }
763: }
764: PetscCall(PetscFree(bb));
765: PetscCall(PetscHeapDestroy(&h));
766: PetscCall(PetscBTDestroy(&bt));
768: /* Column indices are in the list of free space */
769: /* Allocate space for cj, initialize cj, and */
770: /* destroy list of free space and other temporary array(s) */
771: PetscCall(PetscMalloc1(ci[am], &cj));
772: PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
774: /* put together the new symbolic matrix */
775: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
776: PetscCall(MatSetBlockSizesFromMats(C, A, B));
778: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
779: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
780: c = (Mat_SeqAIJ *)C->data;
781: c->free_a = PETSC_TRUE;
782: c->free_ij = PETSC_TRUE;
783: c->nonew = 0;
785: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
787: /* set MatInfo */
788: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
789: if (afill < 1.0) afill = 1.0;
790: C->info.mallocs = ndouble;
791: C->info.fill_ratio_given = fill;
792: C->info.fill_ratio_needed = afill;
794: #if defined(PETSC_USE_INFO)
795: if (ci[am]) {
796: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
797: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
798: } else {
799: PetscCall(PetscInfo(C, "Empty matrix product\n"));
800: }
801: #endif
802: PetscFunctionReturn(PETSC_SUCCESS);
803: }
805: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
806: {
807: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
808: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
809: PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
810: PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz;
811: const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
812: const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
813: const PetscInt *brow_ptr[8], *brow_end[8];
814: PetscInt window[8];
815: PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
816: PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft;
817: PetscReal afill;
818: PetscInt *workj_L1, *workj_L2, *workj_L3;
819: PetscInt L1_nnz, L2_nnz;
821: /* Step 1: Get upper bound on memory required for allocation.
822: Because of the way virtual memory works,
823: only the memory pages that are actually needed will be physically allocated. */
824: PetscFunctionBegin;
825: PetscCall(PetscMalloc1(am + 1, &ci));
826: for (i = 0; i < am; i++) {
827: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
828: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
829: a_rownnz = 0;
830: for (k = 0; k < anzi; ++k) {
831: a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
832: if (a_rownnz > bn) {
833: a_rownnz = bn;
834: break;
835: }
836: }
837: a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
838: }
839: /* temporary work areas for merging rows */
840: PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
841: PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
842: PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
844: /* This should be enough for almost all matrices. If not, memory is reallocated later. */
845: c_maxmem = 8 * (ai[am] + bi[bm]);
846: /* Step 2: Populate pattern for C */
847: PetscCall(PetscMalloc1(c_maxmem, &cj));
849: ci_nnz = 0;
850: ci[0] = 0;
851: worki_L1[0] = 0;
852: worki_L2[0] = 0;
853: for (i = 0; i < am; i++) {
854: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
856: rowsleft = anzi;
857: inputcol_L1 = acol;
858: L2_nnz = 0;
859: L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */
860: worki_L2[1] = 0;
861: outputi_nnz = 0;
863: /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */
864: while (ci_nnz + a_maxrownnz > c_maxmem) {
865: c_maxmem *= 2;
866: ndouble++;
867: PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
868: }
870: while (rowsleft) {
871: L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
872: L1_nrows = 0;
873: L1_nnz = 0;
874: inputcol = inputcol_L1;
875: inputi = bi;
876: inputj = bj;
878: /* The following macro is used to specialize for small rows in A.
879: This helps with compiler unrolling, improving performance substantially.
880: Input: inputj inputi inputcol bn
881: Output: outputj outputi_nnz */
882: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
883: do { \
884: window_min = bn; \
885: outputi_nnz = 0; \
886: for (k = 0; k < ANNZ; ++k) { \
887: brow_ptr[k] = inputj + inputi[inputcol[k]]; \
888: brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
889: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
890: window_min = PetscMin(window[k], window_min); \
891: } \
892: while (window_min < bn) { \
893: outputj[outputi_nnz++] = window_min; \
894: /* advance front and compute new minimum */ \
895: old_window_min = window_min; \
896: window_min = bn; \
897: for (k = 0; k < ANNZ; ++k) { \
898: if (window[k] == old_window_min) { \
899: brow_ptr[k]++; \
900: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
901: } \
902: window_min = PetscMin(window[k], window_min); \
903: } \
904: } \
905: } while (0)
907: /************** L E V E L 1 ***************/
908: /* Merge up to 8 rows of B to L1 work array*/
909: while (L1_rowsleft) {
910: outputi_nnz = 0;
911: if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
912: else outputj = cj + ci_nnz; /* Merge directly to C */
914: switch (L1_rowsleft) {
915: case 1:
916: brow_ptr[0] = inputj + inputi[inputcol[0]];
917: brow_end[0] = inputj + inputi[inputcol[0] + 1];
918: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
919: inputcol += L1_rowsleft;
920: rowsleft -= L1_rowsleft;
921: L1_rowsleft = 0;
922: break;
923: case 2:
924: MatMatMultSymbolic_RowMergeMacro(2);
925: inputcol += L1_rowsleft;
926: rowsleft -= L1_rowsleft;
927: L1_rowsleft = 0;
928: break;
929: case 3:
930: MatMatMultSymbolic_RowMergeMacro(3);
931: inputcol += L1_rowsleft;
932: rowsleft -= L1_rowsleft;
933: L1_rowsleft = 0;
934: break;
935: case 4:
936: MatMatMultSymbolic_RowMergeMacro(4);
937: inputcol += L1_rowsleft;
938: rowsleft -= L1_rowsleft;
939: L1_rowsleft = 0;
940: break;
941: case 5:
942: MatMatMultSymbolic_RowMergeMacro(5);
943: inputcol += L1_rowsleft;
944: rowsleft -= L1_rowsleft;
945: L1_rowsleft = 0;
946: break;
947: case 6:
948: MatMatMultSymbolic_RowMergeMacro(6);
949: inputcol += L1_rowsleft;
950: rowsleft -= L1_rowsleft;
951: L1_rowsleft = 0;
952: break;
953: case 7:
954: MatMatMultSymbolic_RowMergeMacro(7);
955: inputcol += L1_rowsleft;
956: rowsleft -= L1_rowsleft;
957: L1_rowsleft = 0;
958: break;
959: default:
960: MatMatMultSymbolic_RowMergeMacro(8);
961: inputcol += 8;
962: rowsleft -= 8;
963: L1_rowsleft -= 8;
964: break;
965: }
966: inputcol_L1 = inputcol;
967: L1_nnz += outputi_nnz;
968: worki_L1[++L1_nrows] = L1_nnz;
969: }
971: /********************** L E V E L 2 ************************/
972: /* Merge from L1 work array to either C or to L2 work array */
973: if (anzi > 8) {
974: inputi = worki_L1;
975: inputj = workj_L1;
976: inputcol = workcol;
977: outputi_nnz = 0;
979: if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
980: else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */
982: switch (L1_nrows) {
983: case 1:
984: brow_ptr[0] = inputj + inputi[inputcol[0]];
985: brow_end[0] = inputj + inputi[inputcol[0] + 1];
986: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
987: break;
988: case 2:
989: MatMatMultSymbolic_RowMergeMacro(2);
990: break;
991: case 3:
992: MatMatMultSymbolic_RowMergeMacro(3);
993: break;
994: case 4:
995: MatMatMultSymbolic_RowMergeMacro(4);
996: break;
997: case 5:
998: MatMatMultSymbolic_RowMergeMacro(5);
999: break;
1000: case 6:
1001: MatMatMultSymbolic_RowMergeMacro(6);
1002: break;
1003: case 7:
1004: MatMatMultSymbolic_RowMergeMacro(7);
1005: break;
1006: case 8:
1007: MatMatMultSymbolic_RowMergeMacro(8);
1008: break;
1009: default:
1010: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1011: }
1012: L2_nnz += outputi_nnz;
1013: worki_L2[++L2_nrows] = L2_nnz;
1015: /************************ L E V E L 3 **********************/
1016: /* Merge from L2 work array to either C or to L2 work array */
1017: if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1018: inputi = worki_L2;
1019: inputj = workj_L2;
1020: inputcol = workcol;
1021: outputi_nnz = 0;
1022: if (rowsleft) outputj = workj_L3;
1023: else outputj = cj + ci_nnz;
1024: switch (L2_nrows) {
1025: case 1:
1026: brow_ptr[0] = inputj + inputi[inputcol[0]];
1027: brow_end[0] = inputj + inputi[inputcol[0] + 1];
1028: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1029: break;
1030: case 2:
1031: MatMatMultSymbolic_RowMergeMacro(2);
1032: break;
1033: case 3:
1034: MatMatMultSymbolic_RowMergeMacro(3);
1035: break;
1036: case 4:
1037: MatMatMultSymbolic_RowMergeMacro(4);
1038: break;
1039: case 5:
1040: MatMatMultSymbolic_RowMergeMacro(5);
1041: break;
1042: case 6:
1043: MatMatMultSymbolic_RowMergeMacro(6);
1044: break;
1045: case 7:
1046: MatMatMultSymbolic_RowMergeMacro(7);
1047: break;
1048: case 8:
1049: MatMatMultSymbolic_RowMergeMacro(8);
1050: break;
1051: default:
1052: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1053: }
1054: L2_nrows = 1;
1055: L2_nnz = outputi_nnz;
1056: worki_L2[1] = outputi_nnz;
1057: /* Copy to workj_L2 */
1058: if (rowsleft) {
1059: for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1060: }
1061: }
1062: }
1063: } /* while (rowsleft) */
1064: #undef MatMatMultSymbolic_RowMergeMacro
1066: /* terminate current row */
1067: ci_nnz += outputi_nnz;
1068: ci[i + 1] = ci_nnz;
1069: }
1071: /* Step 3: Create the new symbolic matrix */
1072: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1073: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1075: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1076: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1077: c = (Mat_SeqAIJ *)C->data;
1078: c->free_a = PETSC_TRUE;
1079: c->free_ij = PETSC_TRUE;
1080: c->nonew = 0;
1082: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1084: /* set MatInfo */
1085: afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1086: if (afill < 1.0) afill = 1.0;
1087: C->info.mallocs = ndouble;
1088: C->info.fill_ratio_given = fill;
1089: C->info.fill_ratio_needed = afill;
1091: #if defined(PETSC_USE_INFO)
1092: if (ci[am]) {
1093: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1094: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1095: } else {
1096: PetscCall(PetscInfo(C, "Empty matrix product\n"));
1097: }
1098: #endif
1100: /* Step 4: Free temporary work areas */
1101: PetscCall(PetscFree(workj_L1));
1102: PetscCall(PetscFree(workj_L2));
1103: PetscCall(PetscFree(workj_L3));
1104: PetscFunctionReturn(PETSC_SUCCESS);
1105: }
1107: /* concatenate unique entries and then sort */
1108: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1109: {
1110: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1111: const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1112: PetscInt *ci, *cj, bcol;
1113: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1114: PetscReal afill;
1115: PetscInt i, j, ndouble = 0;
1116: PetscSegBuffer seg, segrow;
1117: char *seen;
1119: PetscFunctionBegin;
1120: PetscCall(PetscMalloc1(am + 1, &ci));
1121: ci[0] = 0;
1123: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1124: PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1125: PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1126: PetscCall(PetscCalloc1(bn, &seen));
1128: /* Determine ci and cj */
1129: for (i = 0; i < am; i++) {
1130: const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1131: const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1132: PetscInt packlen = 0, *PETSC_RESTRICT crow;
1134: /* Pack segrow */
1135: for (j = 0; j < anzi; j++) {
1136: PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1137: for (k = bjstart; k < bjend; k++) {
1138: bcol = bj[k];
1139: if (!seen[bcol]) { /* new entry */
1140: PetscInt *PETSC_RESTRICT slot;
1141: PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1142: *slot = bcol;
1143: seen[bcol] = 1;
1144: packlen++;
1145: }
1146: }
1147: }
1149: /* Check i-th diagonal entry */
1150: if (C->force_diagonals && !seen[i]) {
1151: PetscInt *PETSC_RESTRICT slot;
1152: PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1153: *slot = i;
1154: seen[i] = 1;
1155: packlen++;
1156: }
1158: PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1159: PetscCall(PetscSegBufferExtractTo(segrow, crow));
1160: PetscCall(PetscSortInt(packlen, crow));
1161: ci[i + 1] = ci[i] + packlen;
1162: for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1163: }
1164: PetscCall(PetscSegBufferDestroy(&segrow));
1165: PetscCall(PetscFree(seen));
1167: /* Column indices are in the segmented buffer */
1168: PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1169: PetscCall(PetscSegBufferDestroy(&seg));
1171: /* put together the new symbolic matrix */
1172: PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1173: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1175: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1176: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1177: c = (Mat_SeqAIJ *)C->data;
1178: c->free_a = PETSC_TRUE;
1179: c->free_ij = PETSC_TRUE;
1180: c->nonew = 0;
1182: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1184: /* set MatInfo */
1185: afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1186: if (afill < 1.0) afill = 1.0;
1187: C->info.mallocs = ndouble;
1188: C->info.fill_ratio_given = fill;
1189: C->info.fill_ratio_needed = afill;
1191: #if defined(PETSC_USE_INFO)
1192: if (ci[am]) {
1193: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1194: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1195: } else {
1196: PetscCall(PetscInfo(C, "Empty matrix product\n"));
1197: }
1198: #endif
1199: PetscFunctionReturn(PETSC_SUCCESS);
1200: }
1202: static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1203: {
1204: Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
1206: PetscFunctionBegin;
1207: PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1208: PetscCall(MatDestroy(&abt->Bt_den));
1209: PetscCall(MatDestroy(&abt->ABt_den));
1210: PetscCall(PetscFree(abt));
1211: PetscFunctionReturn(PETSC_SUCCESS);
1212: }
1214: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1215: {
1216: Mat Bt;
1217: Mat_MatMatTransMult *abt;
1218: Mat_Product *product = C->product;
1219: char *alg;
1221: PetscFunctionBegin;
1222: PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1223: PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1225: /* create symbolic Bt */
1226: PetscCall(MatTransposeSymbolic(B, &Bt));
1227: PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1228: PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
1230: /* get symbolic C=A*Bt */
1231: PetscCall(PetscStrallocpy(product->alg, &alg));
1232: PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1233: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1234: PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1235: PetscCall(PetscFree(alg));
1237: /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1238: PetscCall(PetscNew(&abt));
1240: product->data = abt;
1241: product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1243: C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1245: abt->usecoloring = PETSC_FALSE;
1246: PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1247: if (abt->usecoloring) {
1248: /* Create MatTransposeColoring from symbolic C=A*B^T */
1249: MatTransposeColoring matcoloring;
1250: MatColoring coloring;
1251: ISColoring iscoloring;
1252: Mat Bt_dense, C_dense;
1254: /* inode causes memory problem */
1255: PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
1257: PetscCall(MatColoringCreate(C, &coloring));
1258: PetscCall(MatColoringSetDistance(coloring, 2));
1259: PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1260: PetscCall(MatColoringSetFromOptions(coloring));
1261: PetscCall(MatColoringApply(coloring, &iscoloring));
1262: PetscCall(MatColoringDestroy(&coloring));
1263: PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
1265: abt->matcoloring = matcoloring;
1267: PetscCall(ISColoringDestroy(&iscoloring));
1269: /* Create Bt_dense and C_dense = A*Bt_dense */
1270: PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1271: PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1272: PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1273: PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
1275: Bt_dense->assembled = PETSC_TRUE;
1276: abt->Bt_den = Bt_dense;
1278: PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1279: PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1280: PetscCall(MatSetType(C_dense, MATSEQDENSE));
1281: PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
1283: Bt_dense->assembled = PETSC_TRUE;
1284: abt->ABt_den = C_dense;
1286: #if defined(PETSC_USE_INFO)
1287: {
1288: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1289: PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1290: Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1291: }
1292: #endif
1293: }
1294: /* clean up */
1295: PetscCall(MatDestroy(&Bt));
1296: PetscFunctionReturn(PETSC_SUCCESS);
1297: }
1299: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1300: {
1301: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1302: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1303: PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1304: PetscLogDouble flops = 0.0;
1305: MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1306: Mat_MatMatTransMult *abt;
1307: Mat_Product *product = C->product;
1309: PetscFunctionBegin;
1310: PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1311: abt = (Mat_MatMatTransMult *)product->data;
1312: PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1313: /* clear old values in C */
1314: if (!c->a) {
1315: PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1316: c->a = ca;
1317: c->free_a = PETSC_TRUE;
1318: } else {
1319: ca = c->a;
1320: PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1321: }
1323: if (abt->usecoloring) {
1324: MatTransposeColoring matcoloring = abt->matcoloring;
1325: Mat Bt_dense, C_dense = abt->ABt_den;
1327: /* Get Bt_dense by Apply MatTransposeColoring to B */
1328: Bt_dense = abt->Bt_den;
1329: PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1331: /* C_dense = A*Bt_dense */
1332: PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1334: /* Recover C from C_dense */
1335: PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1336: PetscFunctionReturn(PETSC_SUCCESS);
1337: }
1339: for (i = 0; i < cm; i++) {
1340: anzi = ai[i + 1] - ai[i];
1341: acol = PetscSafePointerPlusOffset(aj, ai[i]);
1342: aval = PetscSafePointerPlusOffset(aa, ai[i]);
1343: cnzi = ci[i + 1] - ci[i];
1344: ccol = PetscSafePointerPlusOffset(cj, ci[i]);
1345: cval = ca + ci[i];
1346: for (j = 0; j < cnzi; j++) {
1347: brow = ccol[j];
1348: bnzj = bi[brow + 1] - bi[brow];
1349: bcol = bj + bi[brow];
1350: bval = ba + bi[brow];
1352: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1353: nexta = 0;
1354: nextb = 0;
1355: while (nexta < anzi && nextb < bnzj) {
1356: while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1357: if (nexta == anzi) break;
1358: while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1359: if (nextb == bnzj) break;
1360: if (acol[nexta] == bcol[nextb]) {
1361: cval[j] += aval[nexta] * bval[nextb];
1362: nexta++;
1363: nextb++;
1364: flops += 2;
1365: }
1366: }
1367: }
1368: }
1369: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1370: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1371: PetscCall(PetscLogFlops(flops));
1372: PetscFunctionReturn(PETSC_SUCCESS);
1373: }
1375: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1376: {
1377: Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
1379: PetscFunctionBegin;
1380: PetscCall(MatDestroy(&atb->At));
1381: if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1382: PetscCall(PetscFree(atb));
1383: PetscFunctionReturn(PETSC_SUCCESS);
1384: }
1386: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1387: {
1388: Mat At = NULL;
1389: Mat_Product *product = C->product;
1390: PetscBool flg, def, square;
1392: PetscFunctionBegin;
1393: MatCheckProduct(C, 4);
1394: square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1395: /* outerproduct */
1396: PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1397: if (flg) {
1398: /* create symbolic At */
1399: if (!square) {
1400: PetscCall(MatTransposeSymbolic(A, &At));
1401: PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1402: PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1403: }
1404: /* get symbolic C=At*B */
1405: PetscCall(MatProductSetAlgorithm(C, "sorted"));
1406: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1408: /* clean up */
1409: if (!square) PetscCall(MatDestroy(&At));
1411: C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1412: PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1413: PetscFunctionReturn(PETSC_SUCCESS);
1414: }
1416: /* matmatmult */
1417: PetscCall(PetscStrcmp(product->alg, "default", &def));
1418: PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1419: if (flg || def) {
1420: Mat_MatTransMatMult *atb;
1422: PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1423: PetscCall(PetscNew(&atb));
1424: if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1425: PetscCall(MatProductSetAlgorithm(C, "sorted"));
1426: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1427: PetscCall(MatProductSetAlgorithm(C, "at*b"));
1428: product->data = atb;
1429: product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1430: atb->At = At;
1432: C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1433: PetscFunctionReturn(PETSC_SUCCESS);
1434: }
1436: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1437: }
1439: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1440: {
1441: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1442: PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1443: PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1444: PetscLogDouble flops = 0.0;
1445: MatScalar *aa = a->a, *ba, *ca, *caj;
1447: PetscFunctionBegin;
1448: if (!c->a) {
1449: PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1451: c->a = ca;
1452: c->free_a = PETSC_TRUE;
1453: } else {
1454: ca = c->a;
1455: PetscCall(PetscArrayzero(ca, ci[cm]));
1456: }
1458: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1459: for (i = 0; i < am; i++) {
1460: bj = b->j + bi[i];
1461: ba = b->a + bi[i];
1462: bnzi = bi[i + 1] - bi[i];
1463: anzi = ai[i + 1] - ai[i];
1464: for (j = 0; j < anzi; j++) {
1465: nextb = 0;
1466: crow = *aj++;
1467: cjj = cj + ci[crow];
1468: caj = ca + ci[crow];
1469: /* perform sparse axpy operation. Note cjj includes bj. */
1470: for (k = 0; nextb < bnzi; k++) {
1471: if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1472: caj[k] += (*aa) * (*(ba + nextb));
1473: nextb++;
1474: }
1475: }
1476: flops += 2 * bnzi;
1477: aa++;
1478: }
1479: }
1481: /* Assemble the final matrix and clean up */
1482: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1483: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1484: PetscCall(PetscLogFlops(flops));
1485: PetscFunctionReturn(PETSC_SUCCESS);
1486: }
1488: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1489: {
1490: PetscFunctionBegin;
1491: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1492: C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1493: PetscFunctionReturn(PETSC_SUCCESS);
1494: }
1496: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1497: {
1498: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1499: PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1500: const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1501: const PetscInt *aj;
1502: PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1503: PetscInt clda;
1504: PetscInt am4, bm4, col, i, j, n;
1506: PetscFunctionBegin;
1507: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1508: PetscCall(MatSeqAIJGetArrayRead(A, &av));
1509: if (add) {
1510: PetscCall(MatDenseGetArray(C, &c));
1511: } else {
1512: PetscCall(MatDenseGetArrayWrite(C, &c));
1513: }
1514: PetscCall(MatDenseGetArrayRead(B, &b));
1515: PetscCall(MatDenseGetLDA(B, &bm));
1516: PetscCall(MatDenseGetLDA(C, &clda));
1517: am4 = 4 * clda;
1518: bm4 = 4 * bm;
1519: if (b) {
1520: b1 = b;
1521: b2 = b1 + bm;
1522: b3 = b2 + bm;
1523: b4 = b3 + bm;
1524: } else b1 = b2 = b3 = b4 = NULL;
1525: c1 = c;
1526: c2 = c1 + clda;
1527: c3 = c2 + clda;
1528: c4 = c3 + clda;
1529: for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1530: for (i = 0; i < am; i++) { /* over rows of A in those columns */
1531: r1 = r2 = r3 = r4 = 0.0;
1532: n = a->i[i + 1] - a->i[i];
1533: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1534: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1535: for (j = 0; j < n; j++) {
1536: const PetscScalar aatmp = aa[j];
1537: const PetscInt ajtmp = aj[j];
1538: r1 += aatmp * b1[ajtmp];
1539: r2 += aatmp * b2[ajtmp];
1540: r3 += aatmp * b3[ajtmp];
1541: r4 += aatmp * b4[ajtmp];
1542: }
1543: if (add) {
1544: c1[i] += r1;
1545: c2[i] += r2;
1546: c3[i] += r3;
1547: c4[i] += r4;
1548: } else {
1549: c1[i] = r1;
1550: c2[i] = r2;
1551: c3[i] = r3;
1552: c4[i] = r4;
1553: }
1554: }
1555: if (b) {
1556: b1 += bm4;
1557: b2 += bm4;
1558: b3 += bm4;
1559: b4 += bm4;
1560: }
1561: c1 += am4;
1562: c2 += am4;
1563: c3 += am4;
1564: c4 += am4;
1565: }
1566: /* process remaining columns */
1567: if (col != cn) {
1568: PetscInt rc = cn - col;
1570: if (rc == 1) {
1571: for (i = 0; i < am; i++) {
1572: r1 = 0.0;
1573: n = a->i[i + 1] - a->i[i];
1574: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1575: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1576: for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1577: if (add) c1[i] += r1;
1578: else c1[i] = r1;
1579: }
1580: } else if (rc == 2) {
1581: for (i = 0; i < am; i++) {
1582: r1 = r2 = 0.0;
1583: n = a->i[i + 1] - a->i[i];
1584: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1585: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1586: for (j = 0; j < n; j++) {
1587: const PetscScalar aatmp = aa[j];
1588: const PetscInt ajtmp = aj[j];
1589: r1 += aatmp * b1[ajtmp];
1590: r2 += aatmp * b2[ajtmp];
1591: }
1592: if (add) {
1593: c1[i] += r1;
1594: c2[i] += r2;
1595: } else {
1596: c1[i] = r1;
1597: c2[i] = r2;
1598: }
1599: }
1600: } else {
1601: for (i = 0; i < am; i++) {
1602: r1 = r2 = r3 = 0.0;
1603: n = a->i[i + 1] - a->i[i];
1604: aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1605: aa = PetscSafePointerPlusOffset(av, a->i[i]);
1606: for (j = 0; j < n; j++) {
1607: const PetscScalar aatmp = aa[j];
1608: const PetscInt ajtmp = aj[j];
1609: r1 += aatmp * b1[ajtmp];
1610: r2 += aatmp * b2[ajtmp];
1611: r3 += aatmp * b3[ajtmp];
1612: }
1613: if (add) {
1614: c1[i] += r1;
1615: c2[i] += r2;
1616: c3[i] += r3;
1617: } else {
1618: c1[i] = r1;
1619: c2[i] = r2;
1620: c3[i] = r3;
1621: }
1622: }
1623: }
1624: }
1625: PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1626: if (add) {
1627: PetscCall(MatDenseRestoreArray(C, &c));
1628: } else {
1629: PetscCall(MatDenseRestoreArrayWrite(C, &c));
1630: }
1631: PetscCall(MatDenseRestoreArrayRead(B, &b));
1632: PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1633: PetscFunctionReturn(PETSC_SUCCESS);
1634: }
1636: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1637: {
1638: PetscFunctionBegin;
1639: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1640: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1641: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1643: PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1644: PetscFunctionReturn(PETSC_SUCCESS);
1645: }
1647: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1648: {
1649: PetscFunctionBegin;
1650: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1651: C->ops->productsymbolic = MatProductSymbolic_AB;
1652: PetscFunctionReturn(PETSC_SUCCESS);
1653: }
1655: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
1657: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1658: {
1659: PetscFunctionBegin;
1660: C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1661: C->ops->productsymbolic = MatProductSymbolic_AtB;
1662: PetscFunctionReturn(PETSC_SUCCESS);
1663: }
1665: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1666: {
1667: PetscFunctionBegin;
1668: C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1669: C->ops->productsymbolic = MatProductSymbolic_ABt;
1670: PetscFunctionReturn(PETSC_SUCCESS);
1671: }
1673: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1674: {
1675: Mat_Product *product = C->product;
1677: PetscFunctionBegin;
1678: switch (product->type) {
1679: case MATPRODUCT_AB:
1680: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1681: break;
1682: case MATPRODUCT_AtB:
1683: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1684: break;
1685: case MATPRODUCT_ABt:
1686: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1687: break;
1688: default:
1689: break;
1690: }
1691: PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1694: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1695: {
1696: Mat_Product *product = C->product;
1697: Mat A = product->A;
1698: PetscBool baij;
1700: PetscFunctionBegin;
1701: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1702: if (!baij) { /* A is seqsbaij */
1703: PetscBool sbaij;
1704: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1705: PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
1707: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1708: } else { /* A is seqbaij */
1709: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1710: }
1712: C->ops->productsymbolic = MatProductSymbolic_AB;
1713: PetscFunctionReturn(PETSC_SUCCESS);
1714: }
1716: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1717: {
1718: Mat_Product *product = C->product;
1720: PetscFunctionBegin;
1721: MatCheckProduct(C, 1);
1722: PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1723: if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1724: PetscFunctionReturn(PETSC_SUCCESS);
1725: }
1727: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1728: {
1729: PetscFunctionBegin;
1730: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1731: C->ops->productsymbolic = MatProductSymbolic_AB;
1732: PetscFunctionReturn(PETSC_SUCCESS);
1733: }
1735: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1736: {
1737: Mat_Product *product = C->product;
1739: PetscFunctionBegin;
1740: if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1741: PetscFunctionReturn(PETSC_SUCCESS);
1742: }
1744: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1745: {
1746: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1747: Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1748: PetscInt *bi = b->i, *bj = b->j;
1749: PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1750: MatScalar *btval, *btval_den, *ba = b->a;
1751: PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1753: PetscFunctionBegin;
1754: btval_den = btdense->v;
1755: PetscCall(PetscArrayzero(btval_den, m * n));
1756: for (k = 0; k < ncolors; k++) {
1757: ncolumns = coloring->ncolumns[k];
1758: for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1759: col = *(columns + colorforcol[k] + l);
1760: btcol = bj + bi[col];
1761: btval = ba + bi[col];
1762: anz = bi[col + 1] - bi[col];
1763: for (j = 0; j < anz; j++) {
1764: brow = btcol[j];
1765: btval_den[brow] = btval[j];
1766: }
1767: }
1768: btval_den += m;
1769: }
1770: PetscFunctionReturn(PETSC_SUCCESS);
1771: }
1773: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1774: {
1775: Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data;
1776: const PetscScalar *ca_den, *ca_den_ptr;
1777: PetscScalar *ca = csp->a;
1778: PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1779: PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1780: PetscInt nrows, *row, *idx;
1781: PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
1783: PetscFunctionBegin;
1784: PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1786: if (brows > 0) {
1787: PetscInt *lstart, row_end, row_start;
1788: lstart = matcoloring->lstart;
1789: PetscCall(PetscArrayzero(lstart, ncolors));
1791: row_end = brows;
1792: if (row_end > m) row_end = m;
1793: for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1794: ca_den_ptr = ca_den;
1795: for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1796: nrows = matcoloring->nrows[k];
1797: row = rows + colorforrow[k];
1798: idx = den2sp + colorforrow[k];
1799: for (l = lstart[k]; l < nrows; l++) {
1800: if (row[l] >= row_end) {
1801: lstart[k] = l;
1802: break;
1803: } else {
1804: ca[idx[l]] = ca_den_ptr[row[l]];
1805: }
1806: }
1807: ca_den_ptr += m;
1808: }
1809: row_end += brows;
1810: if (row_end > m) row_end = m;
1811: }
1812: } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1813: ca_den_ptr = ca_den;
1814: for (k = 0; k < ncolors; k++) {
1815: nrows = matcoloring->nrows[k];
1816: row = rows + colorforrow[k];
1817: idx = den2sp + colorforrow[k];
1818: for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1819: ca_den_ptr += m;
1820: }
1821: }
1823: PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1824: #if defined(PETSC_USE_INFO)
1825: if (matcoloring->brows > 0) {
1826: PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1827: } else {
1828: PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1829: }
1830: #endif
1831: PetscFunctionReturn(PETSC_SUCCESS);
1832: }
1834: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1835: {
1836: PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1837: const PetscInt *is, *ci, *cj, *row_idx;
1838: PetscInt nis = iscoloring->n, *rowhit, bs = 1;
1839: IS *isa;
1840: Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data;
1841: PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1842: PetscInt *colorforcol, *columns, *columns_i, brows;
1843: PetscBool flg;
1845: PetscFunctionBegin;
1846: PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1848: /* bs >1 is not being tested yet! */
1849: Nbs = mat->cmap->N / bs;
1850: c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */
1851: c->N = Nbs;
1852: c->m = c->M;
1853: c->rstart = 0;
1854: c->brows = 100;
1856: c->ncolors = nis;
1857: PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1858: PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1859: PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1861: brows = c->brows;
1862: PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1863: if (flg) c->brows = brows;
1864: if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
1866: colorforrow[0] = 0;
1867: rows_i = rows;
1868: den2sp_i = den2sp;
1870: PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1871: PetscCall(PetscMalloc1(Nbs + 1, &columns));
1873: colorforcol[0] = 0;
1874: columns_i = columns;
1876: /* get column-wise storage of mat */
1877: PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1879: cm = c->m;
1880: PetscCall(PetscMalloc1(cm + 1, &rowhit));
1881: PetscCall(PetscMalloc1(cm + 1, &idxhit));
1882: for (i = 0; i < nis; i++) { /* loop over color */
1883: PetscCall(ISGetLocalSize(isa[i], &n));
1884: PetscCall(ISGetIndices(isa[i], &is));
1886: c->ncolumns[i] = n;
1887: if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1888: colorforcol[i + 1] = colorforcol[i] + n;
1889: columns_i += n;
1891: /* fast, crude version requires O(N*N) work */
1892: PetscCall(PetscArrayzero(rowhit, cm));
1894: for (j = 0; j < n; j++) { /* loop over columns*/
1895: col = is[j];
1896: row_idx = cj + ci[col];
1897: m = ci[col + 1] - ci[col];
1898: for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1899: idxhit[*row_idx] = spidx[ci[col] + k];
1900: rowhit[*row_idx++] = col + 1;
1901: }
1902: }
1903: /* count the number of hits */
1904: nrows = 0;
1905: for (j = 0; j < cm; j++) {
1906: if (rowhit[j]) nrows++;
1907: }
1908: c->nrows[i] = nrows;
1909: colorforrow[i + 1] = colorforrow[i] + nrows;
1911: nrows = 0;
1912: for (j = 0; j < cm; j++) { /* loop over rows */
1913: if (rowhit[j]) {
1914: rows_i[nrows] = j;
1915: den2sp_i[nrows] = idxhit[j];
1916: nrows++;
1917: }
1918: }
1919: den2sp_i += nrows;
1921: PetscCall(ISRestoreIndices(isa[i], &is));
1922: rows_i += nrows;
1923: }
1924: PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1925: PetscCall(PetscFree(rowhit));
1926: PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1927: PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1929: c->colorforrow = colorforrow;
1930: c->rows = rows;
1931: c->den2sp = den2sp;
1932: c->colorforcol = colorforcol;
1933: c->columns = columns;
1935: PetscCall(PetscFree(idxhit));
1936: PetscFunctionReturn(PETSC_SUCCESS);
1937: }
1939: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1940: {
1941: Mat_Product *product = C->product;
1942: Mat A = product->A, B = product->B;
1944: PetscFunctionBegin;
1945: if (C->ops->mattransposemultnumeric) {
1946: /* Alg: "outerproduct" */
1947: PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1948: } else {
1949: /* Alg: "matmatmult" -- C = At*B */
1950: Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1952: PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1953: if (atb->At) {
1954: /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1955: user may have called MatProductReplaceMats() to get this A=product->A */
1956: PetscCall(MatTransposeSetPrecursor(A, atb->At));
1957: PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1958: }
1959: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1960: }
1961: PetscFunctionReturn(PETSC_SUCCESS);
1962: }
1964: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1965: {
1966: Mat_Product *product = C->product;
1967: Mat A = product->A, B = product->B;
1968: PetscReal fill = product->fill;
1970: PetscFunctionBegin;
1971: PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
1973: C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1974: PetscFunctionReturn(PETSC_SUCCESS);
1975: }
1977: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1978: {
1979: Mat_Product *product = C->product;
1980: PetscInt alg = 0; /* default algorithm */
1981: PetscBool flg = PETSC_FALSE;
1982: #if !defined(PETSC_HAVE_HYPRE)
1983: const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1984: PetscInt nalg = 7;
1985: #else
1986: const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1987: PetscInt nalg = 8;
1988: #endif
1990: PetscFunctionBegin;
1991: /* Set default algorithm */
1992: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
1993: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
1995: /* Get runtime option */
1996: if (product->api_user) {
1997: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
1998: PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
1999: PetscOptionsEnd();
2000: } else {
2001: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2002: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2003: PetscOptionsEnd();
2004: }
2005: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2007: C->ops->productsymbolic = MatProductSymbolic_AB;
2008: C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2009: PetscFunctionReturn(PETSC_SUCCESS);
2010: }
2012: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2013: {
2014: Mat_Product *product = C->product;
2015: PetscInt alg = 0; /* default algorithm */
2016: PetscBool flg = PETSC_FALSE;
2017: const char *algTypes[3] = {"default", "at*b", "outerproduct"};
2018: PetscInt nalg = 3;
2020: PetscFunctionBegin;
2021: /* Get runtime option */
2022: if (product->api_user) {
2023: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2024: PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2025: PetscOptionsEnd();
2026: } else {
2027: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2028: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2029: PetscOptionsEnd();
2030: }
2031: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2033: C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2034: PetscFunctionReturn(PETSC_SUCCESS);
2035: }
2037: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2038: {
2039: Mat_Product *product = C->product;
2040: PetscInt alg = 0; /* default algorithm */
2041: PetscBool flg = PETSC_FALSE;
2042: const char *algTypes[2] = {"default", "color"};
2043: PetscInt nalg = 2;
2045: PetscFunctionBegin;
2046: /* Set default algorithm */
2047: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2048: if (!flg) {
2049: alg = 1;
2050: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2051: }
2053: /* Get runtime option */
2054: if (product->api_user) {
2055: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2056: PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2057: PetscOptionsEnd();
2058: } else {
2059: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2060: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2061: PetscOptionsEnd();
2062: }
2063: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2065: C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2066: C->ops->productsymbolic = MatProductSymbolic_ABt;
2067: PetscFunctionReturn(PETSC_SUCCESS);
2068: }
2070: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2071: {
2072: Mat_Product *product = C->product;
2073: PetscBool flg = PETSC_FALSE;
2074: PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */
2075: #if !defined(PETSC_HAVE_HYPRE)
2076: const char *algTypes[2] = {"scalable", "rap"};
2077: PetscInt nalg = 2;
2078: #else
2079: const char *algTypes[3] = {"scalable", "rap", "hypre"};
2080: PetscInt nalg = 3;
2081: #endif
2083: PetscFunctionBegin;
2084: /* Set default algorithm */
2085: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2086: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2088: /* Get runtime option */
2089: if (product->api_user) {
2090: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2091: PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2092: PetscOptionsEnd();
2093: } else {
2094: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2095: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2096: PetscOptionsEnd();
2097: }
2098: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2100: C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2101: PetscFunctionReturn(PETSC_SUCCESS);
2102: }
2104: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2105: {
2106: Mat_Product *product = C->product;
2107: PetscBool flg = PETSC_FALSE;
2108: PetscInt alg = 0; /* default algorithm */
2109: const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2110: PetscInt nalg = 3;
2112: PetscFunctionBegin;
2113: /* Set default algorithm */
2114: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2115: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2117: /* Get runtime option */
2118: if (product->api_user) {
2119: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2120: PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2121: PetscOptionsEnd();
2122: } else {
2123: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2124: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2125: PetscOptionsEnd();
2126: }
2127: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2129: C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2130: PetscFunctionReturn(PETSC_SUCCESS);
2131: }
2133: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2134: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2135: {
2136: Mat_Product *product = C->product;
2137: PetscInt alg = 0; /* default algorithm */
2138: PetscBool flg = PETSC_FALSE;
2139: const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2140: PetscInt nalg = 7;
2142: PetscFunctionBegin;
2143: /* Set default algorithm */
2144: PetscCall(PetscStrcmp(product->alg, "default", &flg));
2145: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2147: /* Get runtime option */
2148: if (product->api_user) {
2149: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2150: PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2151: PetscOptionsEnd();
2152: } else {
2153: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2154: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2155: PetscOptionsEnd();
2156: }
2157: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2159: C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2160: C->ops->productsymbolic = MatProductSymbolic_ABC;
2161: PetscFunctionReturn(PETSC_SUCCESS);
2162: }
2164: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2165: {
2166: Mat_Product *product = C->product;
2168: PetscFunctionBegin;
2169: switch (product->type) {
2170: case MATPRODUCT_AB:
2171: PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2172: break;
2173: case MATPRODUCT_AtB:
2174: PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2175: break;
2176: case MATPRODUCT_ABt:
2177: PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2178: break;
2179: case MATPRODUCT_PtAP:
2180: PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2181: break;
2182: case MATPRODUCT_RARt:
2183: PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2184: break;
2185: case MATPRODUCT_ABC:
2186: PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2187: break;
2188: default:
2189: break;
2190: }
2191: PetscFunctionReturn(PETSC_SUCCESS);
2192: }