Actual source code: mpimatmatmult.c
1: /*
2: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
3: C = A * B
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/utils/freespace.h>
7: #include <../src/mat/impls/aij/mpi/mpiaij.h>
8: #include <petscbt.h>
9: #include <../src/mat/impls/dense/mpi/mpidense.h>
10: #include <petsc/private/vecimpl.h>
11: #include <petsc/private/sfimpl.h>
13: #if defined(PETSC_HAVE_HYPRE)
14: PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat, Mat, PetscReal, Mat);
15: #endif
17: PETSC_INTERN PetscErrorCode MatProductSymbolic_ABt_MPIAIJ_MPIAIJ(Mat C)
18: {
19: Mat_Product *product = C->product;
20: Mat B = product->B;
22: PetscFunctionBegin;
23: PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &product->B));
24: PetscCall(MatDestroy(&B));
25: PetscCall(MatProductSymbolic_AB_MPIAIJ_MPIAIJ(C));
26: PetscFunctionReturn(PETSC_SUCCESS);
27: }
29: PETSC_INTERN PetscErrorCode MatProductSymbolic_AB_MPIAIJ_MPIAIJ(Mat C)
30: {
31: Mat_Product *product = C->product;
32: Mat A = product->A, B = product->B;
33: MatProductAlgorithm alg = product->alg;
34: PetscReal fill = product->fill;
35: PetscBool flg;
37: PetscFunctionBegin;
38: /* scalable */
39: PetscCall(PetscStrcmp(alg, "scalable", &flg));
40: if (flg) {
41: PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C));
42: PetscFunctionReturn(PETSC_SUCCESS);
43: }
45: /* nonscalable */
46: PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
47: if (flg) {
48: PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C));
49: PetscFunctionReturn(PETSC_SUCCESS);
50: }
52: /* seqmpi */
53: PetscCall(PetscStrcmp(alg, "seqmpi", &flg));
54: if (flg) {
55: PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(A, B, fill, C));
56: PetscFunctionReturn(PETSC_SUCCESS);
57: }
59: /* backend general code */
60: PetscCall(PetscStrcmp(alg, "backend", &flg));
61: if (flg) {
62: PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
63: PetscFunctionReturn(PETSC_SUCCESS);
64: }
66: #if defined(PETSC_HAVE_HYPRE)
67: PetscCall(PetscStrcmp(alg, "hypre", &flg));
68: if (flg) {
69: PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
70: PetscFunctionReturn(PETSC_SUCCESS);
71: }
72: #endif
73: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
74: }
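/*
   Example (sketch, not part of this file's API): the algorithm strings handled above are
   normally chosen through the public MatProduct interface; the calls below are illustrative
   and assume A and B are assembled MPIAIJ matrices.

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetAlgorithm(C, "scalable"));  // any of the names tested above
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));                  // dispatched by the routine above
     PetscCall(MatProductNumeric(C));
*/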
76: PetscErrorCode MatProductCtxDestroy_MPIAIJ_MatMatMult(void **data)
77: {
78: MatProductCtx_APMPI *ptap = *(MatProductCtx_APMPI **)data;
80: PetscFunctionBegin;
81: PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
82: PetscCall(PetscFree(ptap->bufa));
83: PetscCall(MatDestroy(&ptap->P_loc));
84: PetscCall(MatDestroy(&ptap->P_oth));
85: PetscCall(MatDestroy(&ptap->Pt));
86: PetscCall(PetscFree(ptap->api));
87: PetscCall(PetscFree(ptap->apj));
88: PetscCall(PetscFree(ptap->apa));
89: PetscCall(PetscFree(ptap));
90: PetscFunctionReturn(PETSC_SUCCESS);
91: }
93: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, Mat C)
94: {
95: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ *)C->data;
96: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
97: Mat_SeqAIJ *cd = (Mat_SeqAIJ *)c->A->data, *co = (Mat_SeqAIJ *)c->B->data;
98: PetscScalar *cda = cd->a, *coa = co->a;
99: Mat_SeqAIJ *p_loc, *p_oth;
100: PetscScalar *apa, *ca;
101: PetscInt cm = C->rmap->n;
102: MatProductCtx_APMPI *ptap;
103: PetscInt *api, *apj, *apJ, i, k;
104: PetscInt cstart = C->cmap->rstart;
105: PetscInt cdnz, conz, k0, k1;
106: const PetscScalar *dummy;
107: MPI_Comm comm;
108: PetscMPIInt size;
110: PetscFunctionBegin;
111: MatCheckProduct(C, 3);
112: ptap = (MatProductCtx_APMPI *)C->product->data;
113: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
114: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
115: PetscCallMPI(MPI_Comm_size(comm, &size));
116: PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. Do not call MatProductClear()");
118: /* flag CPU mask for C */
119: #if defined(PETSC_HAVE_DEVICE)
120: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
121: if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU;
122: if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU;
123: #endif
125: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
126: /* update numerical values of P_oth and P_loc */
127: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
128: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
130: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
131: /* get data from symbolic products */
132: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
133: p_oth = NULL;
134: if (size > 1) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
136: /* get apa for storing dense row A[i,:]*P */
137: apa = ptap->apa;
139: api = ptap->api;
140: apj = ptap->apj;
141: /* trigger copy to CPU */
142: PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy));
143: PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy));
144: PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy));
145: PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy));
146: for (i = 0; i < cm; i++) {
147: /* compute apa = A[i,:]*P */
148: AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
150: /* set values in C */
151: apJ = PetscSafePointerPlusOffset(apj, api[i]);
152: cdnz = cd->i[i + 1] - cd->i[i];
153: conz = co->i[i + 1] - co->i[i];
155: /* 1st off-diagonal part of C */
156: ca = PetscSafePointerPlusOffset(coa, co->i[i]);
157: k = 0;
158: for (k0 = 0; k0 < conz; k0++) {
159: if (apJ[k] >= cstart) break;
160: ca[k0] = apa[apJ[k]];
161: apa[apJ[k++]] = 0.0;
162: }
164: /* diagonal part of C */
165: ca = PetscSafePointerPlusOffset(cda, cd->i[i]);
166: for (k1 = 0; k1 < cdnz; k1++) {
167: ca[k1] = apa[apJ[k]];
168: apa[apJ[k++]] = 0.0;
169: }
171: /* 2nd off-diagonal part of C */
172: ca = PetscSafePointerPlusOffset(coa, co->i[i]);
173: for (; k0 < conz; k0++) {
174: ca[k0] = apa[apJ[k]];
175: apa[apJ[k++]] = 0.0;
176: }
177: }
178: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
179: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, PetscReal fill, Mat C)
184: {
185: MPI_Comm comm;
186: PetscMPIInt size;
187: MatProductCtx_APMPI *ptap;
188: PetscFreeSpaceList free_space = NULL, current_space = NULL;
189: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
190: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_loc, *p_oth;
191: PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz;
192: PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart;
193: PetscInt *lnk, i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi;
194: PetscInt am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n;
195: PetscBT lnkbt;
196: PetscReal afill;
197: MatType mtype;
199: PetscFunctionBegin;
200: MatCheckProduct(C, 4);
201: PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
202: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
203: PetscCallMPI(MPI_Comm_size(comm, &size));
205: /* create struct MatProductCtx_APMPI and attach it to C later */
206: PetscCall(PetscNew(&ptap));
208: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
209: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
211: /* get P_loc by taking all local rows of P */
212: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
214: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
215: pi_loc = p_loc->i;
216: pj_loc = p_loc->j;
217: if (size > 1) {
218: p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
219: pi_oth = p_oth->i;
220: pj_oth = p_oth->j;
221: } else {
222: p_oth = NULL;
223: pi_oth = NULL;
224: pj_oth = NULL;
225: }
227: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
228: PetscCall(PetscMalloc1(am + 1, &api));
229: ptap->api = api;
230: api[0] = 0;
232: /* create and initialize a linked list */
233: PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt));
235: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
236: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space));
237: current_space = free_space;
239: MatPreallocateBegin(comm, am, pn, dnz, onz);
240: for (i = 0; i < am; i++) {
241: /* diagonal portion of A */
242: nzi = adi[i + 1] - adi[i];
243: for (j = 0; j < nzi; j++) {
244: row = *adj++;
245: pnz = pi_loc[row + 1] - pi_loc[row];
246: Jptr = pj_loc + pi_loc[row];
247: /* add non-zero cols of P into the sorted linked list lnk */
248: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
249: }
250: /* off-diagonal portion of A */
251: nzi = aoi[i + 1] - aoi[i];
252: for (j = 0; j < nzi; j++) {
253: row = *aoj++;
254: pnz = pi_oth[row + 1] - pi_oth[row];
255: Jptr = pj_oth + pi_oth[row];
256: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
257: }
258: /* add possible missing diagonal entry */
259: if (C->force_diagonals) {
260: j = i + rstart; /* column index */
261: PetscCall(PetscLLCondensedAddSorted(1, &j, lnk, lnkbt));
262: }
264: apnz = lnk[0];
265: api[i + 1] = api[i] + apnz;
267: /* if free space is not available, double the total space in the list */
268: if (current_space->local_remaining < apnz) {
269: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
270: nspacedouble++;
271: }
273: /* Copy data into free space, then initialize lnk */
274: PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
275: PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz));
277: current_space->array += apnz;
278: current_space->local_used += apnz;
279: current_space->local_remaining -= apnz;
280: }
282: /* Allocate space for apj, initialize apj, and */
283: /* destroy list of free space and other temporary array(s) */
284: PetscCall(PetscMalloc1(api[am], &ptap->apj));
285: apj = ptap->apj;
286: PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj));
287: PetscCall(PetscLLDestroy(lnk, lnkbt));
289: /* malloc apa to store dense row A[i,:]*P */
290: PetscCall(PetscCalloc1(pN, &ptap->apa));
292: /* set and assemble symbolic parallel matrix C */
293: PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE));
294: PetscCall(MatSetBlockSizesFromMats(C, A, P));
296: PetscCall(MatGetType(A, &mtype));
297: PetscCall(MatSetType(C, mtype));
298: PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
299: MatPreallocateEnd(dnz, onz);
301: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api));
302: PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
303: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
304: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
305: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
307: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
308: C->ops->productnumeric = MatProductNumeric_AB;
310: /* attach the supporting struct to C for reuse */
311: C->product->data = ptap;
312: C->product->destroy = MatProductCtxDestroy_MPIAIJ_MatMatMult;
314: /* set MatInfo */
315: afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5;
316: if (afill < 1.0) afill = 1.0;
317: C->info.mallocs = nspacedouble;
318: C->info.fill_ratio_given = fill;
319: C->info.fill_ratio_needed = afill;
321: #if defined(PETSC_USE_INFO)
322: if (api[am]) {
323: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
324: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
325: } else {
326: PetscCall(PetscInfo(C, "Empty matrix product\n"));
327: }
328: #endif
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
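/*
   Example (sketch): the symbolic/numeric split set up above backs the usual MatMatMult()
   reuse pattern; the fill value below is illustrative.

     Mat C;
     PetscCall(MatMatMult(A, P, MAT_INITIAL_MATRIX, 2.0, &C));          // symbolic + numeric
     // ... update the numerical values of A and/or P (same nonzero pattern) ...
     PetscCall(MatMatMult(A, P, MAT_REUSE_MATRIX, PETSC_CURRENT, &C));  // numeric only, reuses the attached ptap
*/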
332: static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat, Mat, PetscReal, Mat);
333: static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat, Mat, Mat);
335: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AB(Mat C)
336: {
337: Mat_Product *product = C->product;
338: Mat A = product->A, B = product->B;
340: PetscFunctionBegin;
341: if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend)
342: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
344: C->ops->matmultsymbolic = MatMatMultSymbolic_MPIAIJ_MPIDense;
345: C->ops->productsymbolic = MatProductSymbolic_AB;
346: PetscFunctionReturn(PETSC_SUCCESS);
347: }
349: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(Mat C)
350: {
351: Mat_Product *product = C->product;
352: Mat A = product->A, B = product->B;
354: PetscFunctionBegin;
355: if (A->rmap->rstart != B->rmap->rstart || A->rmap->rend != B->rmap->rend)
356: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
358: C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIAIJ_MPIDense;
359: C->ops->productsymbolic = MatProductSymbolic_AtB;
360: PetscFunctionReturn(PETSC_SUCCESS);
361: }
363: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense(Mat C)
364: {
365: Mat_Product *product = C->product;
367: PetscFunctionBegin;
368: switch (product->type) {
369: case MATPRODUCT_AB:
370: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AB(C));
371: break;
372: case MATPRODUCT_AtB:
373: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(C));
374: break;
375: default:
376: break;
377: }
378: PetscFunctionReturn(PETSC_SUCCESS);
379: }
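/*
   Example (sketch): products that reach this dispatcher, with A MPIAIJ and B MPIDENSE;
   purely illustrative calls.

     Mat C, Ct;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C));            // MATPRODUCT_AB path
     PetscCall(MatTransposeMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &Ct));  // MATPRODUCT_AtB path
*/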
381: typedef struct {
382: Mat workB, workB1;
383: MPI_Request *rwaits, *swaits;
384: PetscInt nsends, nrecvs;
385: MPI_Datatype *stype, *rtype;
386: PetscInt blda;
387: } MPIAIJ_MPIDense;
389: static PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void **ctx)
390: {
391: MPIAIJ_MPIDense *contents = *(MPIAIJ_MPIDense **)ctx;
392: PetscInt i;
394: PetscFunctionBegin;
395: PetscCall(MatDestroy(&contents->workB));
396: PetscCall(MatDestroy(&contents->workB1));
397: for (i = 0; i < contents->nsends; i++) PetscCallMPI(MPI_Type_free(&contents->stype[i]));
398: for (i = 0; i < contents->nrecvs; i++) PetscCallMPI(MPI_Type_free(&contents->rtype[i]));
399: PetscCall(PetscFree4(contents->stype, contents->rtype, contents->rwaits, contents->swaits));
400: PetscCall(PetscFree(contents));
401: PetscFunctionReturn(PETSC_SUCCESS);
402: }
404: static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
405: {
406: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
407: PetscInt nz = aij->B->cmap->n, blda, m, M, n, N;
408: MPIAIJ_MPIDense *contents;
409: VecScatter ctx = aij->Mvctx;
410: PetscInt Am = A->rmap->n, Bm = B->rmap->n, BN = B->cmap->N, Bbn, Bbn1, bs, numBb;
411: MPI_Comm comm;
412: MPI_Datatype type1, *stype, *rtype;
413: const PetscInt *sindices, *sstarts, *rstarts;
414: PetscMPIInt *disp, nsends, nrecvs, nrows_to, nrows_from;
415: PetscBool cisdense;
417: PetscFunctionBegin;
418: MatCheckProduct(C, 4);
419: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
420: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
421: PetscCall(PetscObjectBaseTypeCompare((PetscObject)C, MATMPIDENSE, &cisdense));
422: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)B)->type_name));
423: PetscCall(MatGetLocalSize(C, &m, &n));
424: PetscCall(MatGetSize(C, &M, &N));
425: if (m == PETSC_DECIDE || n == PETSC_DECIDE || M == PETSC_DECIDE || N == PETSC_DECIDE) PetscCall(MatSetSizes(C, Am, B->cmap->n, A->rmap->N, BN));
426: PetscCall(MatSetBlockSizesFromMats(C, A, B));
427: PetscCall(MatSetUp(C));
428: PetscCall(MatDenseGetLDA(B, &blda));
429: PetscCall(PetscNew(&contents));
431: PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL));
432: PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL));
434: /* Create column block of B and C for memory scalability when BN is too large */
435: /* Estimate Bbn, column size of Bb */
436: if (nz) {
437: Bbn1 = 2 * Am * BN / nz;
438: if (!Bbn1) Bbn1 = 1;
439: } else Bbn1 = BN;
441: bs = B->cmap->bs;
442: Bbn1 = Bbn1 / bs * bs; /* Bbn1 is a multiple of bs */
443: if (Bbn1 > BN) Bbn1 = BN;
444: PetscCallMPI(MPIU_Allreduce(&Bbn1, &Bbn, 1, MPIU_INT, MPI_MAX, comm));
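/* Worked example with illustrative numbers: Am = 1000 local rows, BN = 512 columns of B and
   nz = 64000 off-process columns of A give Bbn1 = 2*1000*512/64000 = 16 on this rank; after
   rounding down to a multiple of bs and the MPI_MAX reduction, every rank agrees on one Bbn */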
446: /* Enable runtime option for Bbn */
447: PetscOptionsBegin(comm, ((PetscObject)C)->prefix, "MatMatMult", "Mat");
448: PetscCall(PetscOptionsInt("-matmatmult_Bbn", "Number of columns in Bb", "MatMatMult", Bbn, &Bbn, NULL));
449: PetscOptionsEnd();
450: Bbn = PetscMin(Bbn, BN);
452: if (Bbn > 0 && Bbn < BN) {
453: numBb = BN / Bbn;
454: Bbn1 = BN - numBb * Bbn;
455: } else numBb = 0;
457: if (numBb) {
458: PetscCall(PetscInfo(C, "use Bb, BN=%" PetscInt_FMT ", Bbn=%" PetscInt_FMT "; numBb=%" PetscInt_FMT "\n", BN, Bbn, numBb));
459: if (Bbn1) { /* Create workB1 for the remaining columns */
460: PetscCall(PetscInfo(C, "use Bb1, BN=%" PetscInt_FMT ", Bbn1=%" PetscInt_FMT "\n", BN, Bbn1));
461: /* Create work matrix used to store off processor rows of B needed for local product */
462: PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn1, NULL, &contents->workB1));
463: } else contents->workB1 = NULL;
464: }
466: /* Create work matrix used to store off processor rows of B needed for local product */
467: PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn, NULL, &contents->workB));
469: /* Use MPI derived data type to reduce memory required by the send/recv buffers */
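/* Each send type selects the rows of B's local dense array listed in sindices (one indexed
   block per column, resized by the leading dimension blda so a single send covers all columns);
   each receive type is a contiguous row block resized by workB's leading dimension nz */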
470: PetscCall(PetscMalloc4(nsends, &stype, nrecvs, &rtype, nrecvs, &contents->rwaits, nsends, &contents->swaits));
471: contents->stype = stype;
472: contents->nsends = nsends;
474: contents->rtype = rtype;
475: contents->nrecvs = nrecvs;
476: contents->blda = blda;
478: PetscCall(PetscMalloc1(Bm + 1, &disp));
479: for (PetscMPIInt i = 0; i < nsends; i++) {
480: PetscCall(PetscMPIIntCast(sstarts[i + 1] - sstarts[i], &nrows_to));
481: for (PetscInt j = 0; j < nrows_to; j++) PetscCall(PetscMPIIntCast(sindices[sstarts[i] + j], &disp[j])); /* rowB to be sent */
482: PetscCallMPI(MPI_Type_create_indexed_block(nrows_to, 1, disp, MPIU_SCALAR, &type1));
483: PetscCallMPI(MPI_Type_create_resized(type1, 0, blda * sizeof(PetscScalar), &stype[i]));
484: PetscCallMPI(MPI_Type_commit(&stype[i]));
485: PetscCallMPI(MPI_Type_free(&type1));
486: }
488: for (PetscMPIInt i = 0; i < nrecvs; i++) {
489: /* received values from a process form a (nrows_from x Bbn) row block in workB (column-wise) */
490: PetscCall(PetscMPIIntCast(rstarts[i + 1] - rstarts[i], &nrows_from));
491: disp[0] = 0;
492: PetscCallMPI(MPI_Type_create_indexed_block(1, nrows_from, disp, MPIU_SCALAR, &type1));
493: PetscCallMPI(MPI_Type_create_resized(type1, 0, nz * sizeof(PetscScalar), &rtype[i]));
494: PetscCallMPI(MPI_Type_commit(&rtype[i]));
495: PetscCallMPI(MPI_Type_free(&type1));
496: }
498: PetscCall(PetscFree(disp));
499: PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL));
500: PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL));
501: PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
502: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
503: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
505: C->product->data = contents;
506: C->product->destroy = MatMPIAIJ_MPIDenseDestroy;
507: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
508: PetscFunctionReturn(PETSC_SUCCESS);
509: }
511: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat, Mat, Mat, const PetscBool);
513: /*
514: Performs an efficient scatter on the rows of B needed by this process; this is
515: a modification of the VecScatterBegin_() routines.
517: Input: If Bbidx = 0, uses B = Bb, else B = Bb1, see MatMatMultSymbolic_MPIAIJ_MPIDense()
518: */
520: static PetscErrorCode MatMPIDenseScatter(Mat A, Mat B, PetscInt Bbidx, Mat C, Mat *outworkB)
521: {
522: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
523: const PetscScalar *b;
524: PetscScalar *rvalues;
525: VecScatter ctx = aij->Mvctx;
526: const PetscInt *sindices, *sstarts, *rstarts;
527: const PetscMPIInt *sprocs, *rprocs;
528: PetscMPIInt nsends, nrecvs;
529: MPI_Request *swaits, *rwaits;
530: MPI_Comm comm;
531: PetscMPIInt tag = ((PetscObject)ctx)->tag, ncols, nrows, nsends_mpi, nrecvs_mpi;
532: MPIAIJ_MPIDense *contents;
533: Mat workB;
534: MPI_Datatype *stype, *rtype;
535: PetscInt blda;
537: PetscFunctionBegin;
538: MatCheckProduct(C, 4);
539: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
540: PetscCall(PetscMPIIntCast(B->cmap->N, &ncols));
541: PetscCall(PetscMPIIntCast(aij->B->cmap->n, &nrows));
542: contents = (MPIAIJ_MPIDense *)C->product->data;
543: PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL /*bs*/));
544: PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL /*bs*/));
545: PetscCall(PetscMPIIntCast(nsends, &nsends_mpi));
546: PetscCall(PetscMPIIntCast(nrecvs, &nrecvs_mpi));
547: if (Bbidx == 0) workB = *outworkB = contents->workB;
548: else workB = *outworkB = contents->workB1;
549: PetscCheck(nrows == workB->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of rows of workB %" PetscInt_FMT " not equal to columns of aij->B %d", workB->rmap->n, nrows);
550: swaits = contents->swaits;
551: rwaits = contents->rwaits;
553: PetscCall(MatDenseGetArrayRead(B, &b));
554: PetscCall(MatDenseGetLDA(B, &blda));
555: PetscCheck(blda == contents->blda, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot reuse an input matrix with lda %" PetscInt_FMT " != %" PetscInt_FMT, blda, contents->blda);
556: PetscCall(MatDenseGetArray(workB, &rvalues));
558: /* Post recv, use MPI derived data type to save memory */
559: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
560: rtype = contents->rtype;
561: for (PetscMPIInt i = 0; i < nrecvs; i++) PetscCallMPI(MPIU_Irecv(rvalues + (rstarts[i] - rstarts[0]), ncols, rtype[i], rprocs[i], tag, comm, rwaits + i));
563: stype = contents->stype;
564: for (PetscMPIInt i = 0; i < nsends; i++) PetscCallMPI(MPIU_Isend(b, ncols, stype[i], sprocs[i], tag, comm, swaits + i));
566: if (nrecvs) PetscCallMPI(MPI_Waitall(nrecvs_mpi, rwaits, MPI_STATUSES_IGNORE));
567: if (nsends) PetscCallMPI(MPI_Waitall(nsends_mpi, swaits, MPI_STATUSES_IGNORE));
569: PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL));
570: PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL));
571: PetscCall(MatDenseRestoreArrayRead(B, &b));
572: PetscCall(MatDenseRestoreArray(workB, &rvalues));
573: PetscFunctionReturn(PETSC_SUCCESS);
574: }
576: static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A, Mat B, Mat C)
577: {
578: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
579: Mat_MPIDense *bdense = (Mat_MPIDense *)B->data;
580: Mat_MPIDense *cdense = (Mat_MPIDense *)C->data;
581: Mat workB;
582: MPIAIJ_MPIDense *contents;
584: PetscFunctionBegin;
585: MatCheckProduct(C, 3);
586: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
587: contents = (MPIAIJ_MPIDense *)C->product->data;
588: /* diagonal block of A times all local rows of B */
589: /* TODO: this calls a symbolic multiplication every time, which could be avoided */
590: PetscCall(MatMatMult(aij->A, bdense->A, MAT_REUSE_MATRIX, PETSC_CURRENT, &cdense->A));
591: if (contents->workB->cmap->n == B->cmap->N) {
592: /* get off processor parts of B needed to complete C=A*B */
593: PetscCall(MatMPIDenseScatter(A, B, 0, C, &workB));
595: /* off-diagonal block of A times nonlocal rows of B */
596: PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE));
597: } else {
598: Mat Bb, Cb;
599: PetscInt BN = B->cmap->N, n = contents->workB->cmap->n;
600: PetscBool ccpu;
602: PetscCheck(n > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Column block size %" PetscInt_FMT " must be positive", n);
603: /* Prevent unneeded copies back and forth to/from the GPU
604:    when getting and restoring the submatrix.
605:    We need proper GPU code for AIJ * dense in parallel */
606: PetscCall(MatBoundToCPU(C, &ccpu));
607: PetscCall(MatBindToCPU(C, PETSC_TRUE));
608: for (PetscInt i = 0; i < BN; i += n) {
609: PetscCall(MatDenseGetSubMatrix(B, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Bb));
610: PetscCall(MatDenseGetSubMatrix(C, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Cb));
612: /* get off processor parts of B needed to complete C=A*B */
613: PetscCall(MatMPIDenseScatter(A, Bb, (i + n) > BN, C, &workB));
615: /* off-diagonal block of A times nonlocal rows of B */
616: cdense = (Mat_MPIDense *)Cb->data;
617: PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE));
618: PetscCall(MatDenseRestoreSubMatrix(B, &Bb));
619: PetscCall(MatDenseRestoreSubMatrix(C, &Cb));
620: }
621: PetscCall(MatBindToCPU(C, ccpu));
622: }
623: PetscFunctionReturn(PETSC_SUCCESS);
624: }
626: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
627: {
628: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ *)C->data;
629: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
630: Mat_SeqAIJ *cd = (Mat_SeqAIJ *)c->A->data, *co = (Mat_SeqAIJ *)c->B->data;
631: PetscInt *adi = ad->i, *adj, *aoi = ao->i, *aoj;
632: PetscScalar *ada, *aoa, *cda = cd->a, *coa = co->a;
633: Mat_SeqAIJ *p_loc, *p_oth;
634: PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *pj;
635: PetscScalar *pa_loc, *pa_oth, *pa, valtmp, *ca;
636: PetscInt cm = C->rmap->n, anz, pnz;
637: MatProductCtx_APMPI *ptap;
638: PetscScalar *apa_sparse;
639: const PetscScalar *dummy;
640: PetscInt *api, *apj, *apJ, i, j, k, row;
641: PetscInt cstart = C->cmap->rstart;
642: PetscInt cdnz, conz, k0, k1, nextp;
643: MPI_Comm comm;
644: PetscMPIInt size;
646: PetscFunctionBegin;
647: MatCheckProduct(C, 3);
648: ptap = (MatProductCtx_APMPI *)C->product->data;
649: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
650: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
651: PetscCallMPI(MPI_Comm_size(comm, &size));
652: PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. Do not call MatProductClear()");
654: /* flag CPU mask for C */
655: #if defined(PETSC_HAVE_DEVICE)
656: if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
657: if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU;
658: if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU;
659: #endif
660: apa_sparse = ptap->apa;
662: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
663: /* update numerical values of P_oth and P_loc */
664: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
665: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
667: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
668: /* get data from symbolic products */
669: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
670: pi_loc = p_loc->i;
671: pj_loc = p_loc->j;
672: pa_loc = p_loc->a;
673: if (size > 1) {
674: p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
675: pi_oth = p_oth->i;
676: pj_oth = p_oth->j;
677: pa_oth = p_oth->a;
678: } else {
679: p_oth = NULL;
680: pi_oth = NULL;
681: pj_oth = NULL;
682: pa_oth = NULL;
683: }
685: /* trigger copy to CPU */
686: PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy));
687: PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy));
688: PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy));
689: PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy));
690: api = ptap->api;
691: apj = ptap->apj;
692: for (i = 0; i < cm; i++) {
693: apJ = apj + api[i];
695: /* diagonal portion of A */
696: anz = adi[i + 1] - adi[i];
697: adj = ad->j + adi[i];
698: ada = ad->a + adi[i];
699: for (j = 0; j < anz; j++) {
700: row = adj[j];
701: pnz = pi_loc[row + 1] - pi_loc[row];
702: pj = pj_loc + pi_loc[row];
703: pa = pa_loc + pi_loc[row];
704: /* perform sparse axpy */
705: valtmp = ada[j];
706: nextp = 0;
707: for (k = 0; nextp < pnz; k++) {
708: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
709: apa_sparse[k] += valtmp * pa[nextp++];
710: }
711: }
712: PetscCall(PetscLogFlops(2.0 * pnz));
713: }
715: /* off-diagonal portion of A */
716: anz = aoi[i + 1] - aoi[i];
717: aoj = PetscSafePointerPlusOffset(ao->j, aoi[i]);
718: aoa = PetscSafePointerPlusOffset(ao->a, aoi[i]);
719: for (j = 0; j < anz; j++) {
720: row = aoj[j];
721: pnz = pi_oth[row + 1] - pi_oth[row];
722: pj = pj_oth + pi_oth[row];
723: pa = pa_oth + pi_oth[row];
724: /* perform sparse axpy */
725: valtmp = aoa[j];
726: nextp = 0;
727: for (k = 0; nextp < pnz; k++) {
728: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
729: apa_sparse[k] += valtmp * pa[nextp++];
730: }
731: }
732: PetscCall(PetscLogFlops(2.0 * pnz));
733: }
735: /* set values in C */
736: cdnz = cd->i[i + 1] - cd->i[i];
737: conz = co->i[i + 1] - co->i[i];
739: /* 1st off-diagonal part of C */
740: ca = PetscSafePointerPlusOffset(coa, co->i[i]);
741: k = 0;
742: for (k0 = 0; k0 < conz; k0++) {
743: if (apJ[k] >= cstart) break;
744: ca[k0] = apa_sparse[k];
745: apa_sparse[k] = 0.0;
746: k++;
747: }
749: /* diagonal part of C */
750: ca = cda + cd->i[i];
751: for (k1 = 0; k1 < cdnz; k1++) {
752: ca[k1] = apa_sparse[k];
753: apa_sparse[k] = 0.0;
754: k++;
755: }
757: /* 2nd off-diagonal part of C */
758: ca = PetscSafePointerPlusOffset(coa, co->i[i]);
759: for (; k0 < conz; k0++) {
760: ca[k0] = apa_sparse[k];
761: apa_sparse[k] = 0.0;
762: k++;
763: }
764: }
765: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
766: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
767: PetscFunctionReturn(PETSC_SUCCESS);
768: }
770: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable LLCondensed variant to avoid the O(pN) memory requirement */
771: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat C)
772: {
773: MPI_Comm comm;
774: PetscMPIInt size;
775: MatProductCtx_APMPI *ptap;
776: PetscFreeSpaceList free_space = NULL, current_space = NULL;
777: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
778: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_loc, *p_oth;
779: PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz;
780: PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart;
781: PetscInt i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi, *lnk, apnz_max = 1;
782: PetscInt am = A->rmap->n, pn = P->cmap->n, pm = P->rmap->n, lsize = pn + 20;
783: PetscReal afill;
784: MatType mtype;
786: PetscFunctionBegin;
787: MatCheckProduct(C, 4);
788: PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
789: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
790: PetscCallMPI(MPI_Comm_size(comm, &size));
792: /* create struct MatProductCtx_APMPI and attach it to C later */
793: PetscCall(PetscNew(&ptap));
795: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
796: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
798: /* get P_loc by taking all local rows of P */
799: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
801: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
802: pi_loc = p_loc->i;
803: pj_loc = p_loc->j;
804: if (size > 1) {
805: p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
806: pi_oth = p_oth->i;
807: pj_oth = p_oth->j;
808: } else {
809: p_oth = NULL;
810: pi_oth = NULL;
811: pj_oth = NULL;
812: }
814: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
815: PetscCall(PetscMalloc1(am + 1, &api));
816: ptap->api = api;
817: api[0] = 0;
819: PetscCall(PetscLLCondensedCreate_Scalable(lsize, &lnk));
821: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
822: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space));
823: current_space = free_space;
824: MatPreallocateBegin(comm, am, pn, dnz, onz);
825: for (i = 0; i < am; i++) {
826: /* diagonal portion of A */
827: nzi = adi[i + 1] - adi[i];
828: for (j = 0; j < nzi; j++) {
829: row = *adj++;
830: pnz = pi_loc[row + 1] - pi_loc[row];
831: Jptr = pj_loc + pi_loc[row];
832: /* Expand list if it is not long enough */
833: if (pnz + apnz_max > lsize) {
834: lsize = pnz + apnz_max;
835: PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk));
836: }
837: /* add non-zero cols of P into the sorted linked list lnk */
838: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
839: apnz = *lnk; /* The first element in the list is the number of items in the list */
840: api[i + 1] = api[i] + apnz;
841: if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */
842: }
843: /* off-diagonal portion of A */
844: nzi = aoi[i + 1] - aoi[i];
845: for (j = 0; j < nzi; j++) {
846: row = *aoj++;
847: pnz = pi_oth[row + 1] - pi_oth[row];
848: Jptr = pj_oth + pi_oth[row];
849: /* Expand list if it is not long enough */
850: if (pnz + apnz_max > lsize) {
851: lsize = pnz + apnz_max;
852: PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk));
853: }
854: /* add non-zero cols of P into the sorted linked list lnk */
855: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
856: apnz = *lnk; /* The first element in the list is the number of items in the list */
857: api[i + 1] = api[i] + apnz;
858: if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */
859: }
861: /* add missing diagonal entry */
862: if (C->force_diagonals) {
863: j = i + rstart; /* column index */
864: PetscCall(PetscLLCondensedAddSorted_Scalable(1, &j, lnk));
865: }
867: apnz = *lnk;
868: api[i + 1] = api[i] + apnz;
869: if (apnz > apnz_max) apnz_max = apnz;
871: /* if free space is not available, double the total space in the list */
872: if (current_space->local_remaining < apnz) {
873: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
874: nspacedouble++;
875: }
877: /* Copy data into free space, then initialize lnk */
878: PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
879: PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz));
881: current_space->array += apnz;
882: current_space->local_used += apnz;
883: current_space->local_remaining -= apnz;
884: }
886: /* Allocate space for apj, initialize apj, and */
887: /* destroy list of free space and other temporary array(s) */
888: PetscCall(PetscMalloc1(api[am], &ptap->apj));
889: apj = ptap->apj;
890: PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj));
891: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
893: /* create and assemble symbolic parallel matrix C */
894: PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE));
895: PetscCall(MatSetBlockSizesFromMats(C, A, P));
896: PetscCall(MatGetType(A, &mtype));
897: PetscCall(MatSetType(C, mtype));
898: PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
899: MatPreallocateEnd(dnz, onz);
901: /* malloc apa for assembly C */
902: PetscCall(PetscCalloc1(apnz_max, &ptap->apa));
904: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api));
905: PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
906: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
907: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
908: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
910: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
911: C->ops->productnumeric = MatProductNumeric_AB;
913: /* attach the supporting struct to C for reuse */
914: C->product->data = ptap;
915: C->product->destroy = MatProductCtxDestroy_MPIAIJ_MatMatMult;
917: /* set MatInfo */
918: afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5;
919: if (afill < 1.0) afill = 1.0;
920: C->info.mallocs = nspacedouble;
921: C->info.fill_ratio_given = fill;
922: C->info.fill_ratio_needed = afill;
924: #if defined(PETSC_USE_INFO)
925: if (api[am]) {
926: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
927: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n", (double)afill));
928: } else {
929: PetscCall(PetscInfo(C, "Empty matrix product\n"));
930: }
931: #endif
932: PetscFunctionReturn(PETSC_SUCCESS);
933: }
935: /* This function is needed for the seqMPI matrix-matrix multiplication. */
936: /* Three input arrays are merged to one output array. The size of the */
937: /* output array is also output. Duplicate entries only show up once. */
938: static void Merge3SortedArrays(PetscInt size1, PetscInt *in1, PetscInt size2, PetscInt *in2, PetscInt size3, PetscInt *in3, PetscInt *size4, PetscInt *out)
939: {
940: int i = 0, j = 0, k = 0, l = 0;
942: /* Traverse all three arrays */
943: while (i < size1 && j < size2 && k < size3) {
944: if (in1[i] < in2[j] && in1[i] < in3[k]) {
945: out[l++] = in1[i++];
946: } else if (in2[j] < in1[i] && in2[j] < in3[k]) {
947: out[l++] = in2[j++];
948: } else if (in3[k] < in1[i] && in3[k] < in2[j]) {
949: out[l++] = in3[k++];
950: } else if (in1[i] == in2[j] && in1[i] < in3[k]) {
951: out[l++] = in1[i];
952: i++, j++;
953: } else if (in1[i] == in3[k] && in1[i] < in2[j]) {
954: out[l++] = in1[i];
955: i++, k++;
956: } else if (in3[k] == in2[j] && in2[j] < in1[i]) {
957: out[l++] = in2[j];
958: k++, j++;
959: } else if (in1[i] == in2[j] && in1[i] == in3[k]) {
960: out[l++] = in1[i];
961: i++, j++, k++;
962: }
963: }
965: /* Traverse two remaining arrays */
966: while (i < size1 && j < size2) {
967: if (in1[i] < in2[j]) {
968: out[l++] = in1[i++];
969: } else if (in1[i] > in2[j]) {
970: out[l++] = in2[j++];
971: } else {
972: out[l++] = in1[i];
973: i++, j++;
974: }
975: }
977: while (i < size1 && k < size3) {
978: if (in1[i] < in3[k]) {
979: out[l++] = in1[i++];
980: } else if (in1[i] > in3[k]) {
981: out[l++] = in3[k++];
982: } else {
983: out[l++] = in1[i];
984: i++, k++;
985: }
986: }
988: while (k < size3 && j < size2) {
989: if (in3[k] < in2[j]) {
990: out[l++] = in3[k++];
991: } else if (in3[k] > in2[j]) {
992: out[l++] = in2[j++];
993: } else {
994: out[l++] = in3[k];
995: k++, j++;
996: }
997: }
999: /* Traverse one remaining array */
1000: while (i < size1) out[l++] = in1[i++];
1001: while (j < size2) out[l++] = in2[j++];
1002: while (k < size3) out[l++] = in3[k++];
1004: *size4 = l;
1005: }
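/* Example with illustrative input: in1 = {0,3,7}, in2 = {3,5}, in3 = {5,9} yields */
/* out = {0,3,5,7,9} and *size4 = 5; entries shared between the inputs appear once. */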
1007: /* This matrix-matrix multiplication algorithm divides the multiplication into three multiplications and */
1008: /* adds up the products. Two of these three multiplications are performed with existing (sequential) */
1009: /* matrix-matrix multiplications. */
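/* Concretely, in the notation used below: with A = [A_diag A_off] and the local rows of P */
/* split as P_loc = [P_loc_diag P_loc_off], row i of C = A*P is obtained from */
/* A_diag*P_loc_diag (sequential product "adpd"), A_diag*P_loc_off (linked list, "adpo") */
/* and A_off*P_oth (sequential product "aopoth"). */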
1010: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(Mat A, Mat P, PetscReal fill, Mat C)
1011: {
1012: MPI_Comm comm;
1013: PetscMPIInt size;
1014: MatProductCtx_APMPI *ptap;
1015: PetscFreeSpaceList free_space_diag = NULL, current_space = NULL;
1016: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1017: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_loc;
1018: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1019: Mat_SeqAIJ *adpd_seq, *p_off, *aopoth_seq;
1020: PetscInt adponz, adpdnz;
1021: PetscInt *pi_loc, *dnz, *onz;
1022: PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, rstart = A->rmap->rstart;
1023: PetscInt *lnk, i, i1 = 0, pnz, row, *adpoi, *adpoj, *api, *adpoJ, *aopJ, *apJ, *Jptr, aopnz, nspacedouble = 0, j, nzi, *apj, apnz, *adpdi, *adpdj, *adpdJ, *poff_i, *poff_j, *j_temp, *aopothi, *aopothj;
1024: PetscInt am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n, p_colstart, p_colend;
1025: PetscBT lnkbt;
1026: PetscReal afill;
1027: PetscMPIInt rank;
1028: Mat adpd, aopoth;
1029: MatType mtype;
1030: const char *prefix;
1032: PetscFunctionBegin;
1033: MatCheckProduct(C, 4);
1034: PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1035: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1036: PetscCallMPI(MPI_Comm_size(comm, &size));
1037: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1038: PetscCall(MatGetOwnershipRangeColumn(P, &p_colstart, &p_colend));
1040: /* create struct MatProductCtx_APMPI and attach it to C later */
1041: PetscCall(PetscNew(&ptap));
1043: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1044: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1046: /* get P_loc by taking all local rows of P */
1047: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
1049: p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1050: pi_loc = p_loc->i;
1052: /* Allocate memory for the i arrays of the matrices A*P, A_diag*P_off and A_offd * P */
1053: PetscCall(PetscMalloc1(am + 1, &api));
1054: PetscCall(PetscMalloc1(am + 1, &adpoi));
1056: adpoi[0] = 0;
1057: ptap->api = api;
1058: api[0] = 0;
1060: /* create and initialize a linked list, will be used for both A_diag * P_loc_off and A_offd * P_oth */
1061: PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt));
1062: MatPreallocateBegin(comm, am, pn, dnz, onz);
1064: /* Symbolic calc of A_loc_diag * P_loc_diag */
1065: PetscCall(MatGetOptionsPrefix(A, &prefix));
1066: PetscCall(MatProductCreate(a->A, p->A, NULL, &adpd));
1067: PetscCall(MatGetOptionsPrefix(A, &prefix));
1068: PetscCall(MatSetOptionsPrefix(adpd, prefix));
1069: PetscCall(MatAppendOptionsPrefix(adpd, "inner_diag_"));
1071: PetscCall(MatProductSetType(adpd, MATPRODUCT_AB));
1072: PetscCall(MatProductSetAlgorithm(adpd, "sorted"));
1073: PetscCall(MatProductSetFill(adpd, fill));
1074: PetscCall(MatProductSetFromOptions(adpd));
1076: adpd->force_diagonals = C->force_diagonals;
1077: PetscCall(MatProductSymbolic(adpd));
1079: adpd_seq = (Mat_SeqAIJ *)((adpd)->data);
1080: adpdi = adpd_seq->i;
1081: adpdj = adpd_seq->j;
1082: p_off = (Mat_SeqAIJ *)p->B->data;
1083: poff_i = p_off->i;
1084: poff_j = p_off->j;
1086: /* j_temp stores indices of a result row before they are added to the linked list */
1087: PetscCall(PetscMalloc1(pN, &j_temp));
1089: /* Symbolic calc of the A_diag * p_loc_off */
1090: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
1091: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space_diag));
1092: current_space = free_space_diag;
1094: for (i = 0; i < am; i++) {
1095: /* A_diag * P_loc_off */
1096: nzi = adi[i + 1] - adi[i];
1097: for (j = 0; j < nzi; j++) {
1098: row = *adj++;
1099: pnz = poff_i[row + 1] - poff_i[row];
1100: Jptr = poff_j + poff_i[row];
1101: for (i1 = 0; i1 < pnz; i1++) j_temp[i1] = p->garray[Jptr[i1]];
1102: /* add non-zero cols of P into the sorted linked list lnk */
1103: PetscCall(PetscLLCondensedAddSorted(pnz, j_temp, lnk, lnkbt));
1104: }
1106: adponz = lnk[0];
1107: adpoi[i + 1] = adpoi[i] + adponz;
1109: /* if free space is not available, double the total space in the list */
1110: if (current_space->local_remaining < adponz) {
1111: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(adponz, current_space->total_array_size), &current_space));
1112: nspacedouble++;
1113: }
1115: /* Copy data into free space, then initialize lnk */
1116: PetscCall(PetscLLCondensedClean(pN, adponz, current_space->array, lnk, lnkbt));
1118: current_space->array += adponz;
1119: current_space->local_used += adponz;
1120: current_space->local_remaining -= adponz;
1121: }
1123: /* Symbolic calc of A_off * P_oth */
1124: PetscCall(MatSetOptionsPrefix(a->B, prefix));
1125: PetscCall(MatAppendOptionsPrefix(a->B, "inner_offdiag_"));
1126: PetscCall(MatCreate(PETSC_COMM_SELF, &aopoth));
1127: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(a->B, ptap->P_oth, fill, aopoth));
1128: aopoth_seq = (Mat_SeqAIJ *)((aopoth)->data);
1129: aopothi = aopoth_seq->i;
1130: aopothj = aopoth_seq->j;
1132: /* Allocate space for apj, adpj, aopj, ... */
1133: /* destroy lists of free space and other temporary array(s) */
1135: PetscCall(PetscMalloc1(aopothi[am] + adpoi[am] + adpdi[am], &ptap->apj));
1136: PetscCall(PetscMalloc1(adpoi[am], &adpoj));
1138: /* Copy from linked list to j-array */
1139: PetscCall(PetscFreeSpaceContiguous(&free_space_diag, adpoj));
1140: PetscCall(PetscLLDestroy(lnk, lnkbt));
1142: adpoJ = adpoj;
1143: adpdJ = adpdj;
1144: aopJ = aopothj;
1145: apj = ptap->apj;
1146: apJ = apj; /* still empty */
1148: /* Merge j-arrays of A_off * P, A_diag * P_loc_off, and */
1149: /* A_diag * P_loc_diag to get A*P */
1150: for (i = 0; i < am; i++) {
1151: aopnz = aopothi[i + 1] - aopothi[i];
1152: adponz = adpoi[i + 1] - adpoi[i];
1153: adpdnz = adpdi[i + 1] - adpdi[i];
1155: /* Correct indices from A_diag*P_diag */
1156: for (i1 = 0; i1 < adpdnz; i1++) adpdJ[i1] += p_colstart;
1157: /* Merge j-arrays of A_diag * P_loc_off and A_diag * P_loc_diag and A_off * P_oth */
1158: Merge3SortedArrays(adponz, adpoJ, adpdnz, adpdJ, aopnz, aopJ, &apnz, apJ);
1159: PetscCall(MatPreallocateSet(i + rstart, apnz, apJ, dnz, onz));
1161: aopJ += aopnz;
1162: adpoJ += adponz;
1163: adpdJ += adpdnz;
1164: apJ += apnz;
1165: api[i + 1] = api[i] + apnz;
1166: }
1168: /* malloc apa to store dense row A[i,:]*P */
1169: PetscCall(PetscCalloc1(pN, &ptap->apa));
1171: /* create and assemble symbolic parallel matrix C */
1172: PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1173: PetscCall(MatSetBlockSizesFromMats(C, A, P));
1174: PetscCall(MatGetType(A, &mtype));
1175: PetscCall(MatSetType(C, mtype));
1176: PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
1177: MatPreallocateEnd(dnz, onz);
1179: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api));
1180: PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
1181: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1182: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1183: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
1185: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1186: C->ops->productnumeric = MatProductNumeric_AB;
1188: /* attach the supporting struct to C for reuse */
1189: C->product->data = ptap;
1190: C->product->destroy = MatProductCtxDestroy_MPIAIJ_MatMatMult;
1192: /* set MatInfo */
1193: afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5;
1194: if (afill < 1.0) afill = 1.0;
1195: C->info.mallocs = nspacedouble;
1196: C->info.fill_ratio_given = fill;
1197: C->info.fill_ratio_needed = afill;
1199: #if defined(PETSC_USE_INFO)
1200: if (api[am]) {
1201: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
1202: PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n", (double)afill));
1203: } else {
1204: PetscCall(PetscInfo(C, "Empty matrix product\n"));
1205: }
1206: #endif
1208: PetscCall(MatDestroy(&aopoth));
1209: PetscCall(MatDestroy(&adpd));
1210: PetscCall(PetscFree(j_temp));
1211: PetscCall(PetscFree(adpoj));
1212: PetscCall(PetscFree(adpoi));
1213: PetscFunctionReturn(PETSC_SUCCESS);
1214: }
1216: /* This routine only works when scall=MAT_REUSE_MATRIX! */
1217: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P, Mat A, Mat C)
1218: {
1219: MatProductCtx_APMPI *ptap;
1220: Mat Pt;
1222: PetscFunctionBegin;
1223: MatCheckProduct(C, 3);
1224: ptap = (MatProductCtx_APMPI *)C->product->data;
1225: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1226: PetscCheck(ptap->Pt, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
1228: Pt = ptap->Pt;
1229: PetscCall(MatTransposeSetPrecursor(P, Pt));
1230: PetscCall(MatTranspose(P, MAT_REUSE_MATRIX, &Pt));
1231: PetscCall(MatMatMultNumeric_MPIAIJ_MPIAIJ(Pt, A, C));
1232: PetscFunctionReturn(PETSC_SUCCESS);
1233: }
1235: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1236: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, PetscReal fill, Mat C)
1237: {
1238: MatProductCtx_APMPI *ptap;
1239: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1240: MPI_Comm comm;
1241: PetscMPIInt size, rank;
1242: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1243: PetscInt pn = P->cmap->n, aN = A->cmap->N, an = A->cmap->n;
1244: PetscInt *lnk, i, k, rstart;
1245: PetscBT lnkbt;
1246: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv, proc, nsend;
1247: PETSC_UNUSED PetscMPIInt icompleted = 0;
1248: PetscInt **buf_rj, **buf_ri, **buf_ri_k, row, ncols, *cols;
1249: PetscInt len, *dnz, *onz, *owners, nzi;
1250: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1251: MPI_Request *swaits, *rwaits;
1252: MPI_Status *sstatus, rstatus;
1253: PetscLayout rowmap;
1254: PetscInt *owners_co, *coi, *coj; /* i and j arrays of C_oth = (p->B)^T*A - used in the communication */
1255: PetscMPIInt *len_r, *id_r; /* lengths and source ranks of the messages to be received */
1256: PetscInt *Jptr, *prmap = p->garray, con, j, Crmax;
1257: Mat_SeqAIJ *a_loc, *c_loc, *c_oth;
1258: PetscHMapI ta;
1259: MatType mtype;
1260: const char *prefix;
1262: PetscFunctionBegin;
1263: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1264: PetscCallMPI(MPI_Comm_size(comm, &size));
1265: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1267: /* create symbolic parallel matrix C */
1268: PetscCall(MatGetType(A, &mtype));
1269: PetscCall(MatSetType(C, mtype));
1271: C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1273: /* create struct MatProductCtx_APMPI and attach it to C later */
1274: PetscCall(PetscNew(&ptap));
1276: /* (0) compute Rd = Pd^T, Ro = Po^T */
1277: PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1278: PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
1280: /* (1) compute symbolic A_loc */
1281: PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &ptap->A_loc));
1283: /* (2-1) compute symbolic C_oth = Ro*A_loc */
1284: PetscCall(MatGetOptionsPrefix(A, &prefix));
1285: PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1286: PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1287: PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_oth));
1288: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro, ptap->A_loc, fill, ptap->C_oth));
1290: /* (3) send coj of C_oth to other processors */
1291: /* determine row ownership */
1292: PetscCall(PetscLayoutCreate(comm, &rowmap));
1293: rowmap->n = pn;
1294: rowmap->bs = 1;
1295: PetscCall(PetscLayoutSetUp(rowmap));
1296: owners = rowmap->range;
1298: /* determine the number of messages to send, their lengths */
1299: PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 1, &owners_co));
1300: PetscCall(PetscArrayzero(len_s, size));
1301: PetscCall(PetscArrayzero(len_si, size));
1303: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1304: coi = c_oth->i;
1305: coj = c_oth->j;
1306: con = ptap->C_oth->rmap->n;
1307: proc = 0;
1308: for (i = 0; i < con; i++) {
1309: while (prmap[i] >= owners[proc + 1]) proc++;
1310: len_si[proc]++; /* num of rows in Co(=Pt*A) to be sent to [proc] */
1311: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1312: }
1314: len = 0; /* total length of buf_s[], see (4) */
1315: owners_co[0] = 0;
1316: nsend = 0;
1317: for (proc = 0; proc < size; proc++) {
1318: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1319: if (len_s[proc]) {
1320: nsend++;
1321: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1322: len += len_si[proc];
1323: }
1324: }
1326: /* determine the number and length of messages to receive for coi and coj */
1327: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1328: PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
1330: /* post the Irecv and Isend of coj */
1331: PetscCall(PetscCommGetNewTag(comm, &tagj));
1332: PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1333: PetscCall(PetscMalloc1(nsend, &swaits));
1334: for (proc = 0, k = 0; proc < size; proc++) {
1335: if (!len_s[proc]) continue;
1336: i = owners_co[proc];
1337: PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1338: k++;
1339: }
1341: /* (2-2) compute symbolic C_loc = Rd*A_loc */
1342: PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1343: PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1344: PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_loc));
1345: PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd, ptap->A_loc, fill, ptap->C_loc));
1346: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1348: /* receives coj are complete */
1349: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1350: PetscCall(PetscFree(rwaits));
1351: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1353: /* add received column indices into ta to update Crmax */
1354: a_loc = (Mat_SeqAIJ *)ptap->A_loc->data;
1356: /* create and initialize a linked list */
1357: PetscCall(PetscHMapICreateWithSize(an, &ta)); /* for compute Crmax */
1358: MatRowMergeMax_SeqAIJ(a_loc, ptap->A_loc->rmap->N, ta);
1360: for (k = 0; k < nrecv; k++) { /* k-th received message */
1361: Jptr = buf_rj[k];
1362: for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1363: }
1364: PetscCall(PetscHMapIGetSize(ta, &Crmax));
1365: PetscCall(PetscHMapIDestroy(&ta));
1367: /* (4) send and recv coi */
1368: PetscCall(PetscCommGetNewTag(comm, &tagi));
1369: PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1370: PetscCall(PetscMalloc1(len, &buf_s));
1371: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1372: for (proc = 0, k = 0; proc < size; proc++) {
1373: if (!len_s[proc]) continue;
1374: /* form outgoing message for i-structure:
1375: buf_si[0]: nrows to be sent
1376: [1:nrows]: row index (global)
1377: [nrows+1:2*nrows+1]: i-structure index
1378: */
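/* e.g. (illustrative): two rows with 3 and 5 nonzeros give buf_si = {2, row0, row1, 0, 3, 8} */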
1379: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1380: buf_si_i = buf_si + nrows + 1;
1381: buf_si[0] = nrows;
1382: buf_si_i[0] = 0;
1383: nrows = 0;
1384: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1385: nzi = coi[i + 1] - coi[i];
1386: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
1387: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
1388: nrows++;
1389: }
1390: PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1391: k++;
1392: buf_si += len_si[proc];
1393: }
1394: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1395: PetscCall(PetscFree(rwaits));
1396: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1398: PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1399: PetscCall(PetscFree(len_ri));
1400: PetscCall(PetscFree(swaits));
1401: PetscCall(PetscFree(buf_s));
1403: /* (5) compute the local portion of C */
1404: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of C */
1405: PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1406: current_space = free_space;
1408: PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1409: for (k = 0; k < nrecv; k++) {
1410: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1411: nrows = *buf_ri_k[k];
1412: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1413: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1414: }
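       /* nextrow[k]/nextci[k] act as cursors into the k-th received i-structure: they advance
          only when the current local row i matches the next received row, so each received row
          is merged exactly once, in increasing row order */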
1416: MatPreallocateBegin(comm, pn, an, dnz, onz);
1417: PetscCall(PetscLLCondensedCreate(Crmax, aN, &lnk, &lnkbt));
1418: for (i = 0; i < pn; i++) { /* for each local row of C */
1419: /* add C_loc into C */
1420: nzi = c_loc->i[i + 1] - c_loc->i[i];
1421: Jptr = c_loc->j + c_loc->i[i];
1422: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1424: /* add received col data into lnk */
1425: for (k = 0; k < nrecv; k++) { /* k-th received message */
1426: if (i == *nextrow[k]) { /* i-th row */
1427: nzi = *(nextci[k] + 1) - *nextci[k];
1428: Jptr = buf_rj[k] + *nextci[k];
1429: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1430: nextrow[k]++;
1431: nextci[k]++;
1432: }
1433: }
1435: /* add missing diagonal entry */
1436: if (C->force_diagonals) {
1437: k = i + owners[rank]; /* column index */
1438: PetscCall(PetscLLCondensedAddSorted(1, &k, lnk, lnkbt));
1439: }
1441: nzi = lnk[0];
1443: /* copy data into free space, then initialize lnk */
1444: PetscCall(PetscLLCondensedClean(aN, nzi, current_space->array, lnk, lnkbt));
1445: PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1446: }
1447: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1448: PetscCall(PetscLLDestroy(lnk, lnkbt));
1449: PetscCall(PetscFreeSpaceDestroy(free_space));
1451: /* local sizes and preallocation */
1452: PetscCall(MatSetSizes(C, pn, an, PETSC_DETERMINE, PETSC_DETERMINE));
1453: PetscCall(PetscLayoutSetBlockSize(C->rmap, P->cmap->bs));
1454: PetscCall(PetscLayoutSetBlockSize(C->cmap, A->cmap->bs));
1455: PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
1456: MatPreallocateEnd(dnz, onz);
1458: /* add C_loc and C_oth to C */
1459: PetscCall(MatGetOwnershipRange(C, &rstart, NULL));
1460: for (i = 0; i < pn; i++) {
1461: ncols = c_loc->i[i + 1] - c_loc->i[i];
1462: cols = c_loc->j + c_loc->i[i];
1463: row = rstart + i;
1464: PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));
1466: if (C->force_diagonals) PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, 1, (const PetscInt *)&row, NULL, INSERT_VALUES));
1467: }
1468: for (i = 0; i < con; i++) {
1469: ncols = c_oth->i[i + 1] - c_oth->i[i];
1470: cols = c_oth->j + c_oth->i[i];
1471: row = prmap[i];
1472: PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));
1473: }
1474: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1475: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1476: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
1478: /* free the receive buffers and row layout used only during the symbolic phase */
1479: PetscCall(PetscFree(id_r));
1480: PetscCall(PetscFree(len_r));
1481: PetscCall(PetscFree(buf_ri[0]));
1482: PetscCall(PetscFree(buf_ri));
1483: PetscCall(PetscFree(buf_rj[0]));
1484: PetscCall(PetscFree(buf_rj));
1485: PetscCall(PetscLayoutDestroy(&rowmap));
1487: /* attach the supporting struct to C for reuse */
1488: C->product->data = ptap;
1489: C->product->destroy = MatProductCtxDestroy_MPIAIJ_PtAP;
1490: PetscFunctionReturn(PETSC_SUCCESS);
1491: }
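
/* the symbolic phase above leaves Rd, Ro, A_loc, C_loc and C_oth in C->product->data, so the
   numeric routine below can be re-run for new matrix values without repeating the symbolic work */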
1493: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, Mat C)
1494: {
1495: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1496: Mat_SeqAIJ *c_seq;
1497: MatProductCtx_APMPI *ptap;
1498: Mat A_loc, C_loc, C_oth;
1499: PetscInt i, rstart, rend, cm, ncols, row;
1500: const PetscInt *cols;
1501: const PetscScalar *vals;
1503: PetscFunctionBegin;
1504: MatCheckProduct(C, 3);
1505: ptap = (MatProductCtx_APMPI *)C->product->data;
1506: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be computed. Missing data");
1507: PetscCheck(ptap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
1508: PetscCall(MatZeroEntries(C));
1510: /* These matrices are obtained in MatTransposeMatMultSymbolic() */
1511: /* 1) get R = Pd^T, Ro = Po^T */
1512: PetscCall(MatTransposeSetPrecursor(p->A, ptap->Rd));
1513: PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1514: PetscCall(MatTransposeSetPrecursor(p->B, ptap->Ro));
1515: PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1517: /* 2) compute numeric A_loc */
1518: PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &ptap->A_loc));
1520: /* 3) C_loc = Rd*A_loc, C_oth = Ro*A_loc */
1521: A_loc = ptap->A_loc;
1522: PetscCall(ptap->C_loc->ops->matmultnumeric(ptap->Rd, A_loc, ptap->C_loc));
1523: PetscCall(ptap->C_oth->ops->matmultnumeric(ptap->Ro, A_loc, ptap->C_oth));
1524: C_loc = ptap->C_loc;
1525: C_oth = ptap->C_oth;
1527: /* add C_loc and C_oth to C */
1528: PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
1530: /* C_loc -> C */
1531: cm = C_loc->rmap->N;
1532: c_seq = (Mat_SeqAIJ *)C_loc->data;
1533: cols = c_seq->j;
1534: vals = c_seq->a;
1535: for (i = 0; i < cm; i++) {
1536: ncols = c_seq->i[i + 1] - c_seq->i[i];
1537: row = rstart + i;
1538: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1539: cols += ncols;
1540: vals += ncols;
1541: }
1543: /* Co -> C, off-processor part */
1544: cm = C_oth->rmap->N;
1545: c_seq = (Mat_SeqAIJ *)C_oth->data;
1546: cols = c_seq->j;
1547: vals = c_seq->a;
1548: for (i = 0; i < cm; i++) {
1549: ncols = c_seq->i[i + 1] - c_seq->i[i];
1550: row = p->garray[i];
1551: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1552: cols += ncols;
1553: vals += ncols;
1554: }
1555: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1556: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1557: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
1558: PetscFunctionReturn(PETSC_SUCCESS);
1559: }
1561: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P, Mat A, Mat C)
1562: {
1563: MatMergeSeqsToMPI *merge;
1564: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1565: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1566: MatProductCtx_APMPI *ap;
1567: PetscInt *adj;
1568: PetscInt i, j, k, anz, pnz, row, *cj, nexta;
1569: MatScalar *ada, *ca, valtmp;
1570: PetscInt am = A->rmap->n, cm = C->rmap->n, pon = (p->B)->cmap->n;
1571: MPI_Comm comm;
1572: PetscMPIInt size, rank, taga, *len_s, proc;
1573: PetscInt *owners, nrows, **buf_ri_k, **nextrow, **nextci;
1574: PetscInt **buf_ri, **buf_rj;
1575: PetscInt cnz = 0, *bj_i, *bi, *bj, bnz, nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1576: MPI_Request *s_waits, *r_waits;
1577: MPI_Status *status;
1578: MatScalar **abuf_r, *ba_i, *pA, *coa, *ba;
1579: const PetscScalar *dummy;
1580: PetscInt *ai, *aj, *coi, *coj, *poJ, *pdJ;
1581: Mat A_loc;
1582: Mat_SeqAIJ *a_loc;
1584: PetscFunctionBegin;
1585: MatCheckProduct(C, 3);
1586: ap = (MatProductCtx_APMPI *)C->product->data;
1587: PetscCheck(ap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be computed. Missing data");
1588: PetscCheck(ap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
1589: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1590: PetscCallMPI(MPI_Comm_size(comm, &size));
1591: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1593: merge = ap->merge;
1595: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1596: /* get data from symbolic products */
1597: coi = merge->coi;
1598: coj = merge->coj;
1599: PetscCall(PetscCalloc1(coi[pon], &coa));
1600: bi = merge->bi;
1601: bj = merge->bj;
1602: owners = merge->rowmap->range;
1603: PetscCall(PetscCalloc1(bi[cm], &ba));
1605: /* get A_loc by taking all local rows of A */
1606: A_loc = ap->A_loc;
1607: PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
1608: a_loc = (Mat_SeqAIJ *)A_loc->data;
1609: ai = a_loc->i;
1610: aj = a_loc->j;
1612: /* trigger copy to CPU */
1613: PetscCall(MatSeqAIJGetArrayRead(p->A, &dummy));
1614: PetscCall(MatSeqAIJRestoreArrayRead(p->A, &dummy));
1615: PetscCall(MatSeqAIJGetArrayRead(p->B, &dummy));
1616: PetscCall(MatSeqAIJRestoreArrayRead(p->B, &dummy));
1617: for (i = 0; i < am; i++) {
1618: anz = ai[i + 1] - ai[i];
1619: adj = aj + ai[i];
1620: ada = a_loc->a + ai[i];
1622: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1623: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1624: pnz = po->i[i + 1] - po->i[i];
1625: poJ = po->j + po->i[i];
1626: pA = po->a + po->i[i];
1627: for (j = 0; j < pnz; j++) {
1628: row = poJ[j];
1629: cj = coj + coi[row];
1630: ca = coa + coi[row];
1631: /* perform sparse axpy */
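          /* two-pointer merge: cj (row of Co) and adj (row of A_loc) are both sorted and the
             symbolic phase made adj a subset of cj, so advancing k every iteration and nexta only
             on a match scatters valtmp*ada into ca in a single pass (the same merge is repeated
             for the diagonal block below) */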
1632: nexta = 0;
1633: valtmp = pA[j];
1634: for (k = 0; nexta < anz; k++) {
1635: if (cj[k] == adj[nexta]) {
1636: ca[k] += valtmp * ada[nexta];
1637: nexta++;
1638: }
1639: }
1640: PetscCall(PetscLogFlops(2.0 * anz));
1641: }
1643: /* put the value into Cd (diagonal part) */
1644: pnz = pd->i[i + 1] - pd->i[i];
1645: pdJ = pd->j + pd->i[i];
1646: pA = pd->a + pd->i[i];
1647: for (j = 0; j < pnz; j++) {
1648: row = pdJ[j];
1649: cj = bj + bi[row];
1650: ca = ba + bi[row];
1651: /* perform sparse axpy */
1652: nexta = 0;
1653: valtmp = pA[j];
1654: for (k = 0; nexta < anz; k++) {
1655: if (cj[k] == adj[nexta]) {
1656: ca[k] += valtmp * ada[nexta];
1657: nexta++;
1658: }
1659: }
1660: PetscCall(PetscLogFlops(2.0 * anz));
1661: }
1662: }
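     /* at this point ba holds the contribution of the diagonal block Pd to the locally owned rows
        of C = P^T*A, while coa holds the rows of Co = Po^T*A_loc owned by other processes; coa is
        shipped to its owners next and the received counterparts are merged into ba in step 4 */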
1664: /* 3) send and recv matrix values coa */
1665: buf_ri = merge->buf_ri;
1666: buf_rj = merge->buf_rj;
1667: len_s = merge->len_s;
1668: PetscCall(PetscCommGetNewTag(comm, &taga));
1669: PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
1671: PetscCall(PetscMalloc2(merge->nsend, &s_waits, size, &status));
1672: for (proc = 0, k = 0; proc < size; proc++) {
1673: if (!len_s[proc]) continue;
1674: i = merge->owners_co[proc];
1675: PetscCallMPI(MPIU_Isend(coa + coi[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
1676: k++;
1677: }
1678: if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
1679: if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
1681: PetscCall(PetscFree2(s_waits, status));
1682: PetscCall(PetscFree(r_waits));
1683: PetscCall(PetscFree(coa));
1685: /* 4) insert local Cseq and received values into Cmpi */
1686: PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci));
1687: for (k = 0; k < merge->nrecv; k++) {
1688: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1689: nrows = *buf_ri_k[k];
1690: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1691: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1692: }
1694: for (i = 0; i < cm; i++) {
1695: row = owners[rank] + i; /* global row index of C_seq */
1696: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1697: ba_i = ba + bi[i];
1698: bnz = bi[i + 1] - bi[i];
1699: /* add received vals into ba */
1700: for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
1701: /* i-th row */
1702: if (i == *nextrow[k]) {
1703: cnz = *(nextci[k] + 1) - *nextci[k];
1704: cj = buf_rj[k] + *nextci[k];
1705: ca = abuf_r[k] + *nextci[k];
1706: nextcj = 0;
1707: for (j = 0; nextcj < cnz; j++) {
1708: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1709: ba_i[j] += ca[nextcj++];
1710: }
1711: }
1712: nextrow[k]++;
1713: nextci[k]++;
1714: PetscCall(PetscLogFlops(2.0 * cnz));
1715: }
1716: }
1717: PetscCall(MatSetValues(C, 1, &row, bnz, bj_i, ba_i, INSERT_VALUES));
1718: }
1719: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1720: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1722: PetscCall(PetscFree(ba));
1723: PetscCall(PetscFree(abuf_r[0]));
1724: PetscCall(PetscFree(abuf_r));
1725: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1726: PetscFunctionReturn(PETSC_SUCCESS);
1727: }
1729: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P, Mat A, PetscReal fill, Mat C)
1730: {
1731: Mat A_loc;
1732: MatProductCtx_APMPI *ap;
1733: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1734: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *a = (Mat_MPIAIJ *)A->data;
1735: PetscInt *pdti, *pdtj, *poti, *potj, *ptJ;
1736: PetscInt nnz;
1737: PetscInt *lnk, *owners_co, *coi, *coj, i, k, pnz, row;
1738: PetscInt am = A->rmap->n, pn = P->cmap->n;
1739: MPI_Comm comm;
1740: PetscMPIInt size, rank, tagi, tagj, *len_si, *len_s, *len_ri, proc;
1741: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
1742: PetscInt len, *dnz, *onz, *owners;
1743: PetscInt nzi, *bi, *bj;
1744: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1745: MPI_Request *swaits, *rwaits;
1746: MPI_Status *sstatus, rstatus;
1747: MatMergeSeqsToMPI *merge;
1748: PetscInt *ai, *aj, *Jptr, anz, *prmap = p->garray, pon, nspacedouble = 0, j;
1749: PetscReal afill = 1.0, afill_tmp;
1750: PetscInt rstart = P->cmap->rstart, rmax, Armax;
1751: Mat_SeqAIJ *a_loc;
1752: PetscHMapI ta;
1753: MatType mtype;
1755: PetscFunctionBegin;
1756: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1757: /* check if matrix local sizes are compatible */
1758: PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != P (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart,
1759: A->rmap->rend, P->rmap->rstart, P->rmap->rend);
1761: PetscCallMPI(MPI_Comm_size(comm, &size));
1762: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1764: /* create struct MatProductCtx_APMPI and attach it to C later */
1765: PetscCall(PetscNew(&ap));
1767: /* get A_loc by taking all local rows of A */
1768: PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));
1770: ap->A_loc = A_loc;
1771: a_loc = (Mat_SeqAIJ *)A_loc->data;
1772: ai = a_loc->i;
1773: aj = a_loc->j;
1775: /* determine symbolic Co=(p->B)^T*A - send to others */
1776: PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj));
1777: PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->B, &poti, &potj));
1778: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1779: >= (num of nonzero rows of C_seq) - pn */
1780: PetscCall(PetscMalloc1(pon + 1, &coi));
1781: coi[0] = 0;
1783: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1784: nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(poti[pon], ai[am]));
1785: PetscCall(PetscFreeSpaceGet(nnz, &free_space));
1786: current_space = free_space;
1788: /* create and initialize a hash table used to bound Armax */
1789: PetscCall(PetscHMapICreateWithSize(A->cmap->n + a->B->cmap->N, &ta));
1790: MatRowMergeMax_SeqAIJ(a_loc, am, ta);
1791: PetscCall(PetscHMapIGetSize(ta, &Armax));
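     /* Armax is the number of distinct column indices collected in ta from A_loc, an upper bound
        on the length of any row of Pot^T*A_loc; it sizes the condensed linked list created below */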
1793: PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk));
1795: for (i = 0; i < pon; i++) {
1796: pnz = poti[i + 1] - poti[i];
1797: ptJ = potj + poti[i];
1798: for (j = 0; j < pnz; j++) {
1799: row = ptJ[j]; /* row of A_loc == col of Pot */
1800: anz = ai[row + 1] - ai[row];
1801: Jptr = aj + ai[row];
1802: /* add nonzero cols of A_loc[row,:] into the sorted linked list lnk */
1803: PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk));
1804: }
1805: nnz = lnk[0];
1807: /* If free space is not available, double the total space in the list */
1808: if (current_space->local_remaining < nnz) {
1809: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), &current_space));
1810: nspacedouble++;
1811: }
1814: /* Copy data into free space, and zero out dense rows */
1814: PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk));
1816: current_space->array += nnz;
1817: current_space->local_used += nnz;
1818: current_space->local_remaining -= nnz;
1820: coi[i + 1] = coi[i] + nnz;
1821: }
1823: PetscCall(PetscMalloc1(coi[pon], &coj));
1824: PetscCall(PetscFreeSpaceContiguous(&free_space, coj));
1825: PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); /* must destroy to get a new one for C */
1827: afill_tmp = (PetscReal)coi[pon] / (poti[pon] + ai[am] + 1);
1828: if (afill_tmp > afill) afill = afill_tmp;
1830: /* send j-array (coj) of Co to other processors */
1831: /* determine row ownership */
1832: PetscCall(PetscNew(&merge));
1833: PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
1835: merge->rowmap->n = pn;
1836: merge->rowmap->bs = 1;
1838: PetscCall(PetscLayoutSetUp(merge->rowmap));
1839: owners = merge->rowmap->range;
1841: /* determine the number of messages to send, their lengths */
1842: PetscCall(PetscCalloc1(size, &len_si));
1843: PetscCall(PetscCalloc1(size, &merge->len_s));
1845: len_s = merge->len_s;
1846: merge->nsend = 0;
1848: PetscCall(PetscMalloc1(size + 1, &owners_co));
1850: proc = 0;
1851: for (i = 0; i < pon; i++) {
1852: while (prmap[i] >= owners[proc + 1]) proc++;
1853: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1854: len_s[proc] += coi[i + 1] - coi[i];
1855: }
1857: len = 0; /* total length of buf_s[], which holds all outgoing i-structure messages */
1858: owners_co[0] = 0;
1859: for (proc = 0; proc < size; proc++) {
1860: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1861: if (len_s[proc]) {
1862: merge->nsend++;
1863: len_si[proc] = 2 * (len_si[proc] + 1);
1864: len += len_si[proc];
1865: }
1866: }
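     /* each i-structure message carries the row count, nrows local row indices and nrows+1
        cumulative offsets, hence len_si[proc] = 2*(nrows+1) entries */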
1868: /* determine the number and length of messages to receive for coi and coj */
1869: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
1870: PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
1872: /* post the Irecv and Isend of coj */
1873: PetscCall(PetscCommGetNewTag(comm, &tagj));
1874: PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rwaits));
1875: PetscCall(PetscMalloc1(merge->nsend, &swaits));
1876: for (proc = 0, k = 0; proc < size; proc++) {
1877: if (!len_s[proc]) continue;
1878: i = owners_co[proc];
1879: PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1880: k++;
1881: }
1883: /* wait until receives and sends of coj are complete */
1884: PetscCall(PetscMalloc1(size, &sstatus));
1885: for (i = 0; i < merge->nrecv; i++) {
1886: PETSC_UNUSED PetscMPIInt icompleted;
1887: PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus));
1888: }
1889: PetscCall(PetscFree(rwaits));
1890: if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus));
1892: /* add received column indices into table to update Armax */
1893: /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/tutorials/ex56.c! */
1894: for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
1895: Jptr = buf_rj[k];
1896: for (j = 0; j < merge->len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1897: }
1898: PetscCall(PetscHMapIGetSize(ta, &Armax));
1900: /* send and recv coi */
1901: PetscCall(PetscCommGetNewTag(comm, &tagi));
1902: PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &rwaits));
1903: PetscCall(PetscMalloc1(len, &buf_s));
1904: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1905: for (proc = 0, k = 0; proc < size; proc++) {
1906: if (!len_s[proc]) continue;
1907: /* form outgoing message for i-structure:
1908: buf_si[0]: nrows to be sent
1910: [1:nrows]: row index (local to the receiving process)
1910: [nrows+1:2*nrows+1]: i-structure index
1911: */
1912: nrows = len_si[proc] / 2 - 1;
1913: buf_si_i = buf_si + nrows + 1;
1914: buf_si[0] = nrows;
1915: buf_si_i[0] = 0;
1916: nrows = 0;
1917: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1918: nzi = coi[i + 1] - coi[i];
1919: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
1920: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
1921: nrows++;
1922: }
1923: PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1924: k++;
1925: buf_si += len_si[proc];
1926: }
1927: i = merge->nrecv;
1928: while (i--) {
1929: PETSC_UNUSED PetscMPIInt icompleted;
1930: PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus));
1931: }
1932: PetscCall(PetscFree(rwaits));
1933: if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus));
1934: PetscCall(PetscFree(len_si));
1935: PetscCall(PetscFree(len_ri));
1936: PetscCall(PetscFree(swaits));
1937: PetscCall(PetscFree(sstatus));
1938: PetscCall(PetscFree(buf_s));
1940: /* compute the local portion of C (mpi mat) */
1941: /* allocate bi array and free space for accumulating nonzero column info */
1942: PetscCall(PetscMalloc1(pn + 1, &bi));
1943: bi[0] = 0;
1945: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1946: nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(pdti[pn], PetscIntSumTruncate(poti[pon], ai[am])));
1947: PetscCall(PetscFreeSpaceGet(nnz, &free_space));
1948: current_space = free_space;
1950: PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci));
1951: for (k = 0; k < merge->nrecv; k++) {
1952: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1953: nrows = *buf_ri_k[k];
1954: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1955: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1956: }
1958: PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk));
1959: MatPreallocateBegin(comm, pn, A->cmap->n, dnz, onz);
1960: rmax = 0;
1961: for (i = 0; i < pn; i++) {
1962: /* add the nonzero pattern of pdt[i,:]*A into lnk */
1963: pnz = pdti[i + 1] - pdti[i];
1964: ptJ = pdtj + pdti[i];
1965: for (j = 0; j < pnz; j++) {
1966: row = ptJ[j]; /* row of A_loc == col of Pdt */
1967: anz = ai[row + 1] - ai[row];
1968: Jptr = aj + ai[row];
1969: /* add nonzero cols of A_loc[row,:] into the sorted linked list lnk */
1970: PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk));
1971: }
1973: /* add received col data into lnk */
1974: for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
1975: if (i == *nextrow[k]) { /* i-th row */
1976: nzi = *(nextci[k] + 1) - *nextci[k];
1977: Jptr = buf_rj[k] + *nextci[k];
1978: PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
1979: nextrow[k]++;
1980: nextci[k]++;
1981: }
1982: }
1984: /* add missing diagonal entry */
1985: if (C->force_diagonals) {
1986: k = i + owners[rank]; /* column index */
1987: PetscCall(PetscLLCondensedAddSorted_Scalable(1, &k, lnk));
1988: }
1990: nnz = lnk[0];
1992: /* if free space is not available, make more free space */
1993: if (current_space->local_remaining < nnz) {
1994: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), &current_space));
1995: nspacedouble++;
1996: }
1997: /* copy data into free space, then initialize lnk */
1998: PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk));
1999: PetscCall(MatPreallocateSet(i + owners[rank], nnz, current_space->array, dnz, onz));
2001: current_space->array += nnz;
2002: current_space->local_used += nnz;
2003: current_space->local_remaining -= nnz;
2005: bi[i + 1] = bi[i] + nnz;
2006: if (nnz > rmax) rmax = nnz;
2007: }
2008: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
2010: PetscCall(PetscMalloc1(bi[pn], &bj));
2011: PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
2012: afill_tmp = (PetscReal)bi[pn] / (pdti[pn] + poti[pon] + ai[am] + 1);
2013: if (afill_tmp > afill) afill = afill_tmp;
2014: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
2015: PetscCall(PetscHMapIDestroy(&ta));
2016: PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj));
2017: PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->B, &poti, &potj));
2019: /* create symbolic parallel matrix C - why can it not be assembled in the numeric part? */
2020: PetscCall(MatSetSizes(C, pn, A->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
2021: PetscCall(MatSetBlockSizes(C, P->cmap->bs, A->cmap->bs));
2022: PetscCall(MatGetType(A, &mtype));
2023: PetscCall(MatSetType(C, mtype));
2024: PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
2025: MatPreallocateEnd(dnz, onz);
2026: PetscCall(MatSetBlockSize(C, 1));
2027: PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
2028: for (i = 0; i < pn; i++) {
2029: row = i + rstart;
2030: nnz = bi[i + 1] - bi[i];
2031: Jptr = bj + bi[i];
2032: PetscCall(MatSetValues(C, 1, &row, nnz, Jptr, NULL, INSERT_VALUES));
2033: }
2034: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2035: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
2036: PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
2037: merge->bi = bi;
2038: merge->bj = bj;
2039: merge->coi = coi;
2040: merge->coj = coj;
2041: merge->buf_ri = buf_ri;
2042: merge->buf_rj = buf_rj;
2043: merge->owners_co = owners_co;
2045: /* attach the supporting struct to C for reuse */
2046: C->product->data = ap;
2047: C->product->destroy = MatProductCtxDestroy_MPIAIJ_PtAP;
2048: ap->merge = merge;
2050: C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
2052: #if defined(PETSC_USE_INFO)
2053: if (bi[pn] != 0) {
2054: PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
2055: PetscCall(PetscInfo(C, "Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n", (double)afill));
2056: } else {
2057: PetscCall(PetscInfo(C, "Empty matrix product\n"));
2058: }
2059: #endif
2060: PetscFunctionReturn(PETSC_SUCCESS);
2061: }
2063: static PetscErrorCode MatProductSymbolic_AtB_MPIAIJ_MPIAIJ(Mat C)
2064: {
2065: Mat_Product *product = C->product;
2066: Mat A = product->A, B = product->B;
2067: PetscReal fill = product->fill;
2068: PetscBool flg;
2070: PetscFunctionBegin;
2071: /* scalable */
2072: PetscCall(PetscStrcmp(product->alg, "scalable", &flg));
2073: if (flg) {
2074: PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C));
2075: goto next;
2076: }
2078: /* nonscalable */
2079: PetscCall(PetscStrcmp(product->alg, "nonscalable", &flg));
2080: if (flg) {
2081: PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C));
2082: goto next;
2083: }
2085: /* matmatmult */
2086: PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
2087: if (flg) {
2088: Mat At;
2089: MatProductCtx_APMPI *ptap;
2091: PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
2092: PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(At, B, fill, C));
2093: ptap = (MatProductCtx_APMPI *)C->product->data;
2094: if (ptap) {
2095: ptap->Pt = At;
2096: C->product->destroy = MatProductCtxDestroy_MPIAIJ_PtAP;
2097: }
2098: C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
2099: goto next;
2100: }
2102: /* backend general code */
2103: PetscCall(PetscStrcmp(product->alg, "backend", &flg));
2104: if (flg) {
2105: PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2106: PetscFunctionReturn(PETSC_SUCCESS);
2107: }
2109: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct algorithm is not supported");
2111: next:
2112: C->ops->productnumeric = MatProductNumeric_AtB;
2113: PetscFunctionReturn(PETSC_SUCCESS);
2114: }
2116: /* Set options for MatMatMultxxx_MPIAIJ_MPIAIJ */
2117: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AB(Mat C)
2118: {
2119: Mat_Product *product = C->product;
2120: Mat A = product->A, B = product->B;
2121: #if defined(PETSC_HAVE_HYPRE)
2122: const char *algTypes[5] = {"scalable", "nonscalable", "seqmpi", "backend", "hypre"};
2123: PetscInt nalg = 5;
2124: #else
2125: const char *algTypes[4] = {
2126: "scalable",
2127: "nonscalable",
2128: "seqmpi",
2129: "backend",
2130: };
2131: PetscInt nalg = 4;
2132: #endif
2133: PetscInt alg = 1; /* set nonscalable algorithm as default */
2134: PetscBool flg;
2135: MPI_Comm comm;
2137: PetscFunctionBegin;
2138: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2140: /* Set "nonscalable" as default algorithm */
2141: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2142: if (flg) {
2143: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2145: /* Set "scalable" as default if BN and local nonzeros of A and B are large */
2146: if (B->cmap->N > 100000) { /* may switch to scalable algorithm as default */
2147: MatInfo Ainfo, Binfo;
2148: PetscInt nz_local;
2149: PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;
2151: PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
2152: PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo));
2153: nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated);
2155: if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
2156: PetscCallMPI(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPI_C_BOOL, MPI_LOR, comm));
2158: if (alg_scalable) {
2159: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
2160: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2161: PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local)));
2162: }
2163: }
2164: }
2166: /* Get runtime option */
2167: if (product->api_user) {
2168: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
2169: PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2170: PetscOptionsEnd();
2171: } else {
2172: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2173: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2174: PetscOptionsEnd();
2175: }
2176: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2178: C->ops->productsymbolic = MatProductSymbolic_AB_MPIAIJ_MPIAIJ;
2179: PetscFunctionReturn(PETSC_SUCCESS);
2180: }
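
/*
   A minimal usage sketch (the executable name and the matrices A, B, C are placeholders) of how
   the algorithms registered above are selected at run time:

     ./app -matmatmult_via scalable             (MatMatMult() interface)
     ./app -mat_product_algorithm seqmpi        (MatProduct interface, MATPRODUCT_AB)

   or directly in code:

     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetAlgorithm(C, "backend"));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/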
2182: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABt(Mat C)
2183: {
2184: PetscFunctionBegin;
2185: PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C));
2186: C->ops->productsymbolic = MatProductSymbolic_ABt_MPIAIJ_MPIAIJ;
2187: PetscFunctionReturn(PETSC_SUCCESS);
2188: }
2190: /* Set options for MatTransposeMatMultXXX_MPIAIJ_MPIAIJ */
2191: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AtB(Mat C)
2192: {
2193: Mat_Product *product = C->product;
2194: Mat A = product->A, B = product->B;
2195: const char *algTypes[4] = {"scalable", "nonscalable", "at*b", "backend"};
2196: PetscInt nalg = 4;
2197: PetscInt alg = 1; /* set default algorithm */
2198: PetscBool flg;
2199: MPI_Comm comm;
2201: PetscFunctionBegin;
2202: /* Check matrix local sizes */
2203: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2204: PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")",
2205: A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
2207: /* Set default algorithm */
2208: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2209: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2211: /* Set "scalable" as default if BN and local nonzeros of A and B are large */
2212: if (alg && B->cmap->N > 100000) { /* may switch to scalable algorithm as default */
2213: MatInfo Ainfo, Binfo;
2214: PetscInt nz_local;
2215: PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;
2217: PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
2218: PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo));
2219: nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated);
2221: if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
2222: PetscCallMPI(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPI_C_BOOL, MPI_LOR, comm));
2224: if (alg_scalable) {
2225: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
2226: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2227: PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local)));
2228: }
2229: }
2231: /* Get runtime option */
2232: if (product->api_user) {
2233: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2234: PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2235: PetscOptionsEnd();
2236: } else {
2237: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2238: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2239: PetscOptionsEnd();
2240: }
2241: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2243: C->ops->productsymbolic = MatProductSymbolic_AtB_MPIAIJ_MPIAIJ;
2244: PetscFunctionReturn(PETSC_SUCCESS);
2245: }
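
/*
   A minimal usage sketch (A, P and C are placeholders; 2.0 is a fill estimate, nnz(C)/(nnz(P)+nnz(A)))
   of the convenience interface that reaches the AtB routines above:

     PetscCall(MatTransposeMatMult(P, A, MAT_INITIAL_MATRIX, 2.0, &C));   C = P^T*A, symbolic + numeric
     PetscCall(MatTransposeMatMult(P, A, MAT_REUSE_MATRIX, 2.0, &C));     reuse the stored symbolic data

   combined with -mattransposematmult_via {scalable|nonscalable|at*b|backend} to pick one of the
   algorithms registered above.
*/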
2247: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_PtAP(Mat C)
2248: {
2249: Mat_Product *product = C->product;
2250: Mat A = product->A, P = product->B;
2251: MPI_Comm comm;
2252: PetscBool flg;
2253: PetscInt alg = 1; /* set default algorithm */
2254: #if !defined(PETSC_HAVE_HYPRE)
2255: const char *algTypes[5] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend"};
2256: PetscInt nalg = 5;
2257: #else
2258: const char *algTypes[6] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend", "hypre"};
2259: PetscInt nalg = 6;
2260: #endif
2261: PetscInt pN = P->cmap->N;
2263: PetscFunctionBegin;
2264: /* Check matrix local sizes */
2265: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2266: PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Arow (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
2267: A->rmap->rstart, A->rmap->rend, P->rmap->rstart, P->rmap->rend);
2268: PetscCheck(A->cmap->rstart == P->rmap->rstart && A->cmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Acol (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
2269: A->cmap->rstart, A->cmap->rend, P->rmap->rstart, P->rmap->rend);
2271: /* Set "nonscalable" as default algorithm */
2272: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2273: if (flg) {
2274: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2276: /* Set "scalable" as default if pN and local nonzeros of A and P are large */
2277: if (pN > 100000) {
2278: MatInfo Ainfo, Pinfo;
2279: PetscInt nz_local;
2280: PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;
2282: PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
2283: PetscCall(MatGetInfo(P, MAT_LOCAL, &Pinfo));
2284: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
2286: if (pN > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
2287: PetscCallMPI(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPI_C_BOOL, MPI_LOR, comm));
2289: if (alg_scalable) {
2290: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
2291: PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2292: }
2293: }
2294: }
2296: /* Get runtime option */
2297: if (product->api_user) {
2298: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2299: PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
2300: PetscOptionsEnd();
2301: } else {
2302: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2303: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
2304: PetscOptionsEnd();
2305: }
2306: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2308: C->ops->productsymbolic = MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ;
2309: PetscFunctionReturn(PETSC_SUCCESS);
2310: }
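
/*
   A minimal usage sketch (A, P and C are placeholders; 2.0 is a fill estimate) of the
   corresponding PtAP entry point:

     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C));   C = P^T*A*P

   with the algorithm chosen via -matptap_via {scalable|nonscalable|allatonce|allatonce_merged|backend}
   (plus hypre when PETSc is configured with hypre).
*/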
2312: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_RARt(Mat C)
2313: {
2314: Mat_Product *product = C->product;
2315: Mat A = product->A, R = product->B;
2317: PetscFunctionBegin;
2318: /* Check matrix local sizes */
2319: PetscCheck(A->cmap->n == R->cmap->n && A->rmap->n == R->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A local (%" PetscInt_FMT ", %" PetscInt_FMT "), R local (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->n,
2320: A->cmap->n, R->rmap->n, R->cmap->n);
2322: C->ops->productsymbolic = MatProductSymbolic_RARt_MPIAIJ_MPIAIJ;
2323: PetscFunctionReturn(PETSC_SUCCESS);
2324: }
2326: /*
2327: Set options for ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from the AB algorithms
2328: */
2329: static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABC(Mat C)
2330: {
2331: Mat_Product *product = C->product;
2332: PetscBool flg = PETSC_FALSE;
2333: PetscInt alg = 1; /* default algorithm */
2334: const char *algTypes[3] = {"scalable", "nonscalable", "seqmpi"};
2335: PetscInt nalg = 3;
2337: PetscFunctionBegin;
2338: /* Set default algorithm */
2339: PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2340: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2342: /* Get runtime option */
2343: if (product->api_user) {
2344: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2345: PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2346: PetscOptionsEnd();
2347: } else {
2348: PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2349: PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2350: PetscOptionsEnd();
2351: }
2352: if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2354: C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_MPIAIJ_MPIAIJ_MPIAIJ;
2355: C->ops->productsymbolic = MatProductSymbolic_ABC;
2356: PetscFunctionReturn(PETSC_SUCCESS);
2357: }
2359: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJ(Mat C)
2360: {
2361: Mat_Product *product = C->product;
2363: PetscFunctionBegin;
2364: switch (product->type) {
2365: case MATPRODUCT_AB:
2366: PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C));
2367: break;
2368: case MATPRODUCT_ABt:
2369: PetscCall(MatProductSetFromOptions_MPIAIJ_ABt(C));
2370: break;
2371: case MATPRODUCT_AtB:
2372: PetscCall(MatProductSetFromOptions_MPIAIJ_AtB(C));
2373: break;
2374: case MATPRODUCT_PtAP:
2375: PetscCall(MatProductSetFromOptions_MPIAIJ_PtAP(C));
2376: break;
2377: case MATPRODUCT_RARt:
2378: PetscCall(MatProductSetFromOptions_MPIAIJ_RARt(C));
2379: break;
2380: case MATPRODUCT_ABC:
2381: PetscCall(MatProductSetFromOptions_MPIAIJ_ABC(C));
2382: break;
2383: default:
2384: break;
2385: }
2386: PetscFunctionReturn(PETSC_SUCCESS);
2387: }