Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3:   and to find submatrices that are shared across the processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
21: /*
22:   Takes a general IS and builds a block version of it that contains the given IS plus any values needed to fill out the blocks.
24:   The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and that would offer higher performance, but
25:   it is a very major recoding job.
27:   Possible scalability issue with this routine: it allocates space proportional to Nmax-Nmin.
28: */
29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30: {
31: PetscFunctionBegin;
32: for (PetscInt i = 0; i < imax; i++) {
33: if (!is[i]) break;
34: PetscInt n = 0, N, Nmax, Nmin;
35: const PetscInt *idx;
36: PetscInt *nidx = NULL;
37: MPI_Comm comm;
38: PetscBT bt;
40: PetscCall(ISGetLocalSize(is[i], &N));
41: if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42: PetscCall(ISGetIndices(is[i], &idx));
43: PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44: Nmin = Nmin / bs;
45: Nmax = Nmax / bs;
46: PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47: for (PetscInt j = 0; j < N; j++) {
48: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49: }
50: PetscCall(PetscMalloc1(n, &nidx));
51: n = 0;
52: PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53: for (PetscInt j = 0; j < N; j++) {
54: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55: }
56: PetscCall(PetscBTDestroy(&bt));
57: PetscCall(ISRestoreIndices(is[i], &idx));
58: }
59: PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60: PetscCall(ISDestroy(is + i));
61: PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62: }
63: PetscFunctionReturn(PETSC_SUCCESS);
64: }
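/*
  Worked example (added illustration): with bs = 2 and an input IS holding the global indices
  {1, 4, 5, 9}, the loops above map them to block indices {0, 2, 2, 4}, drop the duplicate, and
  call ISCreateBlock() with {0, 2, 4}; the resulting block IS expands to the point indices
  {0, 1, 4, 5, 8, 9}, i.e. the original entries plus the values needed to fill out each block.
*/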
66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67: {
68: PetscInt i;
70: PetscFunctionBegin;
71: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74: PetscFunctionReturn(PETSC_SUCCESS);
75: }
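/*
  Illustrative usage (an added sketch, not part of this file): MPIAIJ matrices normally reach
  this routine through the public MatIncreaseOverlap() interface, e.g.

    IS is[1];
    PetscCall(ISCreateGeneral(PETSC_COMM_WORLD, nlocal, rows, PETSC_COPY_VALUES, &is[0]));
    PetscCall(MatIncreaseOverlap(C, 1, is, 2)); /* grow is[0] by two levels of overlap */
    ...
    PetscCall(ISDestroy(&is[0]));

  where nlocal and rows (the caller's initial global row indices) are assumed to be defined.
*/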
77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78: {
79: PetscInt i;
81: PetscFunctionBegin;
82: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85: PetscFunctionReturn(PETSC_SUCCESS);
86: }
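/*
  Added reading guide for MatIncreaseOverlap_MPIAIJ_Once_Scalable() below (a summary of the code,
  not authoritative documentation): one round of overlap increase proceeds in three phases:
    1) every IS entry owned by another rank is collected into <is id, row> pairs and shipped to
       the owning rank via PetscCommBuildTwoSided() and a PetscSF broadcast;
    2) the receiving rank expands the requested rows through MatIncreaseOverlap_MPIAIJ_Send_Scalable(),
       while MatIncreaseOverlap_MPIAIJ_Local_Scalable() handles the locally owned rows;
    3) the resulting column lists travel back and are merged into the index sets by
       MatIncreaseOverlap_MPIAIJ_Receive_Scalable().
*/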
88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89: {
90: MPI_Comm comm;
91: PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
92: PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93: PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94: const PetscInt *indices_i, **indices;
95: PetscLayout rmap;
96: PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
97: PetscSF sf;
98: PetscSFNode *remote;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102: PetscCallMPI(MPI_Comm_rank(comm, &rank));
103: PetscCallMPI(MPI_Comm_size(comm, &size));
104: /* get row map to determine where rows should be going */
105: PetscCall(MatGetLayouts(mat, &rmap, NULL));
106:   /* retrieve the IS data and put it all together so that we
107:    * can optimize communication
108:    */
109: PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110: tlength = 0;
111: for (PetscInt i = 0; i < nidx; i++) {
112: PetscCall(ISGetLocalSize(is[i], &length[i]));
113: tlength += length[i];
114: PetscCall(ISGetIndices(is[i], &indices[i]));
115: }
116: /* find these rows on remote processors */
117: PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
118: PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
119: nrrows = 0;
120: for (PetscInt i = 0; i < nidx; i++) {
121: length_i = length[i];
122: indices_i = indices[i];
123: for (PetscInt j = 0; j < length_i; j++) {
124: owner = -1;
125: PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
126: /* remote processors */
127: if (owner != rank) {
128: tosizes_temp[owner]++; /* number of rows to owner */
129: rrow_ranks[nrrows] = owner; /* processor */
130: rrow_isids[nrrows] = i; /* is id */
131: remoterows[nrrows++] = indices_i[j]; /* row */
132: }
133: }
134: PetscCall(ISRestoreIndices(is[i], &indices[i]));
135: }
136: PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
137:   /* test if we need to exchange messages;
138:    * generally speaking, we do not need to exchange
139:    * data when the overlap is 1
140:    */
141: PetscCallMPI(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
142:   /* we do not have any messages;
143:    * this usually corresponds to overlap 1
144:    */
145: if (!reducednrrows) {
146: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
147: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
148: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
149: PetscFunctionReturn(PETSC_SUCCESS);
150: }
151: nto = 0;
152: /* send sizes and ranks for building a two-sided communication */
153: for (PetscMPIInt i = 0; i < size; i++) {
154: if (tosizes_temp[i]) {
155: tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
156: tosizes_temp[i] = nto; /* a map from processor to index */
157: toranks[nto++] = i; /* MPI process */
158: }
159: }
160: PetscCall(PetscMalloc1(nto + 1, &toffsets));
161: toffsets[0] = 0;
162: for (PetscInt i = 0; i < nto; i++) {
163: toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
164: tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
165: }
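  /* Illustrative layout (hypothetical counts): if this rank must request 3 rows from rank 2 and
     1 row from rank 5, then toranks = {2, 5}, toffsets = {0, 6, 8} and tosizes = {6, 0, 2, 6},
     where each pair in tosizes is <message length in PetscInts, send offset> and every requested
     row costs two PetscInts (its is id and its row number). */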
166: /* send information to other processors */
167: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
168: nrecvrows = 0;
169: for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
170: PetscCall(PetscMalloc1(nrecvrows, &remote));
171: nrecvrows = 0;
172: for (PetscMPIInt i = 0; i < nfrom; i++) {
173: for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
174: remote[nrecvrows].rank = fromranks[i];
175: remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
176: }
177: }
178: PetscCall(PetscSFCreate(comm, &sf));
179: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
180:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
181: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
182: PetscCall(PetscSFSetFromOptions(sf));
183:   /* message pair <index set id, row> */
184: PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
185: for (PetscInt i = 0; i < nrrows; i++) {
186: owner = rrow_ranks[i]; /* process */
187: j = tosizes_temp[owner]; /* index */
188: todata[toffsets[j]++] = rrow_isids[i];
189: todata[toffsets[j]++] = remoterows[i];
190: }
191: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
192: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
193: PetscCall(PetscFree(toffsets));
194: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
196: PetscCall(PetscSFDestroy(&sf));
197:   /* send the rows that belong to remote processes so that we can get the overlapping data back */
198: PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
199: PetscCall(PetscFree2(todata, fromdata));
200: PetscCall(PetscFree(fromsizes));
201: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
202: PetscCall(PetscFree(fromranks));
203: nrecvrows = 0;
204: for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
205: PetscCall(PetscCalloc1(nrecvrows, &todata));
206: PetscCall(PetscMalloc1(nrecvrows, &remote));
207: nrecvrows = 0;
208: for (PetscInt i = 0; i < nto; i++) {
209: for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
210: remote[nrecvrows].rank = toranks[i];
211: remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
212: }
213: }
214: PetscCall(PetscSFCreate(comm, &sf));
215: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
216:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
217: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
218: PetscCall(PetscSFSetFromOptions(sf));
219: /* overlap communication and computation */
220: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
221: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
222: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
223: PetscCall(PetscSFDestroy(&sf));
224: PetscCall(PetscFree2(sbdata, sbsizes));
225: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
226: PetscCall(PetscFree(toranks));
227: PetscCall(PetscFree(tosizes));
228: PetscCall(PetscFree(todata));
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
233: {
234: PetscInt *isz, isz_i, i, j, is_id, data_size;
235: PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
236: const PetscInt *indices_i_temp;
237: MPI_Comm *iscomms;
239: PetscFunctionBegin;
240: max_lsize = 0;
241: PetscCall(PetscMalloc1(nidx, &isz));
242: for (i = 0; i < nidx; i++) {
243: PetscCall(ISGetLocalSize(is[i], &lsize));
244: max_lsize = lsize > max_lsize ? lsize : max_lsize;
245: isz[i] = lsize;
246: }
247: PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
248: for (i = 0; i < nidx; i++) {
249: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
250: PetscCall(ISGetIndices(is[i], &indices_i_temp));
251: PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
252: PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
253: PetscCall(ISDestroy(&is[i]));
254: }
255: /* retrieve information to get row id and its overlap */
256: for (i = 0; i < nrecvs;) {
257: is_id = recvdata[i++];
258: data_size = recvdata[i++];
259: indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
260: isz_i = isz[is_id];
261: for (j = 0; j < data_size; j++) {
262: col = recvdata[i++];
263: indices_i[isz_i++] = col;
264: }
265: isz[is_id] = isz_i;
266: }
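  /* Illustrative recvdata layout (hypothetical values): {0, 3, 10, 12, 17, 2, 1, 44} means
     "append the 3 columns 10, 12, 17 to is[0] and the single column 44 to is[2]"; duplicates
     are removed below before the index sets are rebuilt. */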
267:   /* remove duplicate entries */
268: for (i = 0; i < nidx; i++) {
269: indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
270: isz_i = isz[i];
271: PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
272: PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
273: PetscCall(PetscCommDestroy(&iscomms[i]));
274: }
275: PetscCall(PetscFree(isz));
276: PetscCall(PetscFree2(indices_temp, iscomms));
277: PetscFunctionReturn(PETSC_SUCCESS);
278: }
280: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
281: {
282: PetscLayout rmap, cmap;
283: PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
284: PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
285: PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
286: const PetscInt *gcols, *ai, *aj, *bi, *bj;
287: Mat amat, bmat;
288: PetscMPIInt rank;
289: PetscBool done;
290: MPI_Comm comm;
292: PetscFunctionBegin;
293: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
294: PetscCallMPI(MPI_Comm_rank(comm, &rank));
295: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
296:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
297:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
298:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
299:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
300:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
301: /* total number of nonzero values is used to estimate the memory usage in the next step */
302: tnz = ai[an] + bi[bn];
303: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
304: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
305: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
306: /* to find the longest message */
307: max_fszs = 0;
308: for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
309:   /* is there a better way to estimate the number of nonzeros in the matrix? */
310: PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
311: for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
312: rows_pos = 0;
313: totalrows = 0;
314: for (i = 0; i < nfrom; i++) {
315: PetscCall(PetscArrayzero(rows_pos_i, nidx));
316: /* group data together */
317: for (j = 0; j < fromsizes[2 * i]; j += 2) {
318: is_id = fromrows[rows_pos++]; /* no of is */
319: rows_i = rows_data[is_id];
320: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
321: }
322: /* estimate a space to avoid multiple allocations */
323: for (j = 0; j < nidx; j++) {
324: indvc_ij = 0;
325: rows_i = rows_data[j];
326: for (l = 0; l < rows_pos_i[j]; l++) {
327: row = rows_i[l] - rstart;
328: start = ai[row];
329: end = ai[row + 1];
330: for (k = start; k < end; k++) { /* Amat */
331: col = aj[k] + cstart;
332: indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
333: }
334: start = bi[row];
335: end = bi[row + 1];
336: for (k = start; k < end; k++) { /* Bmat */
337: col = gcols[bj[k]];
338: indices_tmp[indvc_ij++] = col;
339: }
340: }
341: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
342: indv_counts[i * nidx + j] = indvc_ij;
343: totalrows += indvc_ij;
344: }
345: }
346:   /* message triple <index set id, number of rows, rows> */
347: PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
348: totalrows = 0;
349: rows_pos = 0;
350:   /* reuse the same grouping logic as above */
351: for (i = 0; i < nfrom; i++) {
352: PetscCall(PetscArrayzero(rows_pos_i, nidx));
353: for (j = 0; j < fromsizes[2 * i]; j += 2) {
354: is_id = fromrows[rows_pos++];
355: rows_i = rows_data[is_id];
356: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
357: }
358: /* add data */
359: for (j = 0; j < nidx; j++) {
360: if (!indv_counts[i * nidx + j]) continue;
361: indvc_ij = 0;
362: sbdata[totalrows++] = j;
363: sbdata[totalrows++] = indv_counts[i * nidx + j];
364: sbsizes[2 * i] += 2;
365: rows_i = rows_data[j];
366: for (l = 0; l < rows_pos_i[j]; l++) {
367: row = rows_i[l] - rstart;
368: start = ai[row];
369: end = ai[row + 1];
370: for (k = start; k < end; k++) { /* Amat */
371: col = aj[k] + cstart;
372: indices_tmp[indvc_ij++] = col;
373: }
374: start = bi[row];
375: end = bi[row + 1];
376: for (k = start; k < end; k++) { /* Bmat */
377: col = gcols[bj[k]];
378: indices_tmp[indvc_ij++] = col;
379: }
380: }
381: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
382: sbsizes[2 * i] += indvc_ij;
383: PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
384: totalrows += indvc_ij;
385: }
386: }
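  /* Illustrative sbdata layout (hypothetical values): for one destination rank and two index
     sets the buffer could read {0, 2, 7, 9, 1, 3, 4, 8, 15}, i.e. the triple
     <is id 0, 2 columns: 7 9> followed by <is id 1, 3 columns: 4 8 15>, giving
     sbsizes[2*i] = 2 + 2 + 2 + 3 = 9 PetscInts for that rank. */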
387: PetscCall(PetscMalloc1(nfrom + 1, &offsets));
388: offsets[0] = 0;
389: for (i = 0; i < nfrom; i++) {
390: offsets[i + 1] = offsets[i] + sbsizes[2 * i];
391: sbsizes[2 * i + 1] = offsets[i];
392: }
393: PetscCall(PetscFree(offsets));
394: if (sbrowsizes) *sbrowsizes = sbsizes;
395: if (sbrows) *sbrows = sbdata;
396: PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
397: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
398: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
399: PetscFunctionReturn(PETSC_SUCCESS);
400: }
402: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
403: {
404: const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
405: PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
406: PetscInt lsize, lsize_tmp;
407: PetscMPIInt rank, owner;
408: Mat amat, bmat;
409: PetscBool done;
410: PetscLayout cmap, rmap;
411: MPI_Comm comm;
413: PetscFunctionBegin;
414: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
415: PetscCallMPI(MPI_Comm_rank(comm, &rank));
416: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
417:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
418:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
419:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
420:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
421:   /* is this a safe way to compute the number of nonzero values? */
422: tnz = ai[an] + bi[bn];
423: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
424: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
425: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
426:   /* this is a better way to estimate memory than the old implementation,
427:    * which used the global size of the matrix
428:    */
429: PetscCall(PetscMalloc1(tnz, &indices_temp));
430: for (i = 0; i < nidx; i++) {
431: MPI_Comm iscomm;
433: PetscCall(ISGetLocalSize(is[i], &lsize));
434: PetscCall(ISGetIndices(is[i], &indices));
435: lsize_tmp = 0;
436: for (j = 0; j < lsize; j++) {
437: owner = -1;
438: row = indices[j];
439: PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
440: if (owner != rank) continue;
441: /* local number */
442: row -= rstart;
443: start = ai[row];
444: end = ai[row + 1];
445: for (k = start; k < end; k++) { /* Amat */
446: col = aj[k] + cstart;
447: indices_temp[lsize_tmp++] = col;
448: }
449: start = bi[row];
450: end = bi[row + 1];
451: for (k = start; k < end; k++) { /* Bmat */
452: col = gcols[bj[k]];
453: indices_temp[lsize_tmp++] = col;
454: }
455: }
456: PetscCall(ISRestoreIndices(is[i], &indices));
457: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
458: PetscCall(ISDestroy(&is[i]));
459: PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
460: PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
461: PetscCall(PetscCommDestroy(&iscomm));
462: }
463: PetscCall(PetscFree(indices_temp));
464: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
465: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
466: PetscFunctionReturn(PETSC_SUCCESS);
467: }
469: /*
470:   Sample message format:
471:   If process A wants process B to process some elements corresponding
472:   to index sets is[1] and is[5]:
473:   mesg[0] = 2   (number of index sets in the message)
474:   -----------
475:   mesg[1] = 1 => is[1]
476:   mesg[2] = size of is[1];
477:   -----------
478:   mesg[3] = 5 => is[5]
479:   mesg[4] = size of is[5];
480:   -----------
481:   mesg[5]
482:   mesg[n]   data for is[1]
483:   -----------
484:   mesg[n+1]
485:   mesg[m]   data for is[5]
486:   -----------
488:   Notes:
489:   nrqs - number of requests sent (or to be sent out)
490:   nrqr - number of requests received (which have to be, or have been, processed)
491: */
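/*
  Concrete instance of the format above (hypothetical numbers): a request covering rows {30, 41}
  of is[1] and row {12} of is[5] is packed as mesg = {2, 1, 2, 5, 1, 30, 41, 12}.
*/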
492: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
493: {
494: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
495: PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
496: const PetscInt **idx, *idx_i;
497: PetscInt *n, **data, len;
498: #if defined(PETSC_USE_CTABLE)
499: PetscHMapI *table_data, table_data_i;
500: PetscInt *tdata, tcount, tcount_max;
501: #else
502: PetscInt *data_i, *d_p;
503: #endif
504: PetscMPIInt size, rank, tag1, tag2, proc = 0, nrqs, *pa;
505: PetscInt M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
506: PetscInt *ctr, *tmp, *isz, **xdata, **rbuf2;
507: PetscBT *table;
508: MPI_Comm comm;
509: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
510: MPI_Status *recv_status;
511: MPI_Comm *iscomms;
512: char *t_p;
514: PetscFunctionBegin;
515: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
516: size = c->size;
517: rank = c->rank;
518: M = C->rmap->N;
520: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
521: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
523: PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
525: for (PetscInt i = 0; i < imax; i++) {
526: PetscCall(ISGetIndices(is[i], &idx[i]));
527: PetscCall(ISGetLocalSize(is[i], &n[i]));
528: }
530:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
531:      required. Based on this, buffers are allocated, and data copied into them */
532: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
533: for (PetscInt i = 0; i < imax; i++) {
534: PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
535: idx_i = idx[i];
536: len = n[i];
537: for (PetscInt j = 0; j < len; j++) {
538: row = idx_i[j];
539: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
540: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
541: w4[proc]++;
542: }
543: for (PetscMPIInt j = 0; j < size; j++) {
544: if (w4[j]) {
545: w1[j] += w4[j];
546: w3[j]++;
547: }
548: }
549: }
551:   nrqs     = 0; /* number of outgoing messages */
552:   msz      = 0; /* total mesg length (for all procs) */
553:   w1[rank] = 0; /* no mesg sent to itself */
554: w3[rank] = 0;
555: for (PetscMPIInt i = 0; i < size; i++) {
556: if (w1[i]) {
557: w2[i] = 1;
558: nrqs++;
559: } /* there exists a message to proc i */
560: }
561: /* pa - is list of processors to communicate with */
562: PetscCall(PetscMalloc1(nrqs, &pa));
563: for (PetscMPIInt i = 0, j = 0; i < size; i++) {
564: if (w1[i]) {
565: pa[j] = i;
566: j++;
567: }
568: }
570:   /* Each message would have a header = 1 + 2*(number of IS) + data */
571: for (PetscMPIInt i = 0; i < nrqs; i++) {
572: PetscMPIInt j = pa[i];
573: w1[j] += w2[j] + 2 * w3[j];
574: msz += w1[j];
575: }
577:   /* Determine the number of messages to expect, their lengths, and the sending ranks */
578: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
579: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
581: /* Now post the Irecvs corresponding to these messages */
582: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
584: /* Allocate Memory for outgoing messages */
585: PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
586: PetscCall(PetscArrayzero(outdat, size));
587: PetscCall(PetscArrayzero(ptr, size));
589: {
590: PetscInt *iptr = tmp, ict = 0;
591: for (PetscMPIInt i = 0; i < nrqs; i++) {
592: PetscMPIInt j = pa[i];
593: iptr += ict;
594: outdat[j] = iptr;
595: ict = w1[j];
596: }
597: }
599: /* Form the outgoing messages */
600: /* plug in the headers */
601: for (PetscMPIInt i = 0; i < nrqs; i++) {
602: PetscMPIInt j = pa[i];
603: outdat[j][0] = 0;
604: PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
605: ptr[j] = outdat[j] + 2 * w3[j] + 1;
606: }
608: /* Memory for doing local proc's work */
609: {
610: PetscInt M_BPB_imax = 0;
611: #if defined(PETSC_USE_CTABLE)
612: PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
613: PetscCall(PetscMalloc1(imax, &table_data));
614: for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
615: PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
616: for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
617: #else
618: PetscInt Mimax = 0;
619: PetscCall(PetscIntMultError(M, imax, &Mimax));
620: PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
621: PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
622: for (PetscInt i = 0; i < imax; i++) {
623: table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
624: data[i] = d_p + M * i;
625: }
626: #endif
627: }
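  /* In words: table[i] is a bitmap of C->rmap->N bits marking which global indices are already in
     the i-th set, while data[i] (or, when PETSC_USE_CTABLE is defined, the hash map table_data[i])
     holds the indices themselves and isz[i] counts them; the bitmap gives O(1) duplicate checks and
     the array/hash map keeps what eventually goes into the rebuilt IS. */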
629: /* Parse the IS and update local tables and the outgoing buf with the data */
630: {
631: PetscInt n_i, isz_i, *outdat_j, ctr_j;
632: PetscBT table_i;
634: for (PetscInt i = 0; i < imax; i++) {
635: PetscCall(PetscArrayzero(ctr, size));
636: n_i = n[i];
637: table_i = table[i];
638: idx_i = idx[i];
639: #if defined(PETSC_USE_CTABLE)
640: table_data_i = table_data[i];
641: #else
642: data_i = data[i];
643: #endif
644: isz_i = isz[i];
645: for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
646: row = idx_i[j];
647: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
648: if (proc != rank) { /* copy to the outgoing buffer */
649: ctr[proc]++;
650: *ptr[proc] = row;
651: ptr[proc]++;
652: } else if (!PetscBTLookupSet(table_i, row)) {
653: #if defined(PETSC_USE_CTABLE)
654: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
655: #else
656: data_i[isz_i] = row; /* Update the local table */
657: #endif
658: isz_i++;
659: }
660: }
661: /* Update the headers for the current IS */
662: for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
663: if ((ctr_j = ctr[j])) {
664: outdat_j = outdat[j];
665: k = ++outdat_j[0];
666: outdat_j[2 * k] = ctr_j;
667: outdat_j[2 * k - 1] = i;
668: }
669: }
670: isz[i] = isz_i;
671: }
672: }
674: /* Now post the sends */
675: PetscCall(PetscMalloc1(nrqs, &s_waits1));
676: for (PetscMPIInt i = 0; i < nrqs; ++i) {
677: PetscMPIInt j = pa[i];
678: PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
679: }
681: /* No longer need the original indices */
682: PetscCall(PetscMalloc1(imax, &iscomms));
683: for (PetscInt i = 0; i < imax; ++i) {
684: PetscCall(ISRestoreIndices(is[i], idx + i));
685: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
686: }
687: PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
689: for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
691: /* Do Local work */
692: #if defined(PETSC_USE_CTABLE)
693: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
694: #else
695: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
696: #endif
698: /* Receive messages */
699: PetscCall(PetscMalloc1(nrqr, &recv_status));
700: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
701: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
703: /* Phase 1 sends are complete - deallocate buffers */
704: PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
705: PetscCall(PetscFree4(w1, w2, w3, w4));
707: PetscCall(PetscMalloc1(nrqr, &xdata));
708: PetscCall(PetscMalloc1(nrqr, &isz1));
709: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
710: PetscCall(PetscFree(rbuf[0]));
711: PetscCall(PetscFree(rbuf));
713: /* Send the data back */
714:   /* Do a global reduction to know the buffer space required for incoming messages */
715: {
716: PetscMPIInt *rw1;
718: PetscCall(PetscCalloc1(size, &rw1));
719: for (PetscMPIInt i = 0; i < nrqr; ++i) {
720: proc = recv_status[i].MPI_SOURCE;
721: PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
722: PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
723: }
724: PetscCall(PetscFree(onodes1));
725: PetscCall(PetscFree(olengths1));
727:     /* Determine the number of messages to expect, their lengths, and the sending ranks */
728: PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
729: PetscCall(PetscFree(rw1));
730: }
731: /* Now post the Irecvs corresponding to these messages */
732: PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
734: /* Now post the sends */
735: PetscCall(PetscMalloc1(nrqr, &s_waits2));
736: for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); }
738: /* receive work done on other processors */
739: {
740: PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
741: PetscMPIInt idex;
742: PetscBT table_i;
744: for (PetscMPIInt i = 0; i < nrqs; ++i) {
745: PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
746: /* Process the message */
747: rbuf2_i = rbuf2[idex];
748: ct1 = 2 * rbuf2_i[0] + 1;
749: jmax = rbuf2[idex][0];
750: for (PetscInt j = 1; j <= jmax; j++) {
751: max = rbuf2_i[2 * j];
752: is_no = rbuf2_i[2 * j - 1];
753: isz_i = isz[is_no];
754: table_i = table[is_no];
755: #if defined(PETSC_USE_CTABLE)
756: table_data_i = table_data[is_no];
757: #else
758: data_i = data[is_no];
759: #endif
760: for (k = 0; k < max; k++, ct1++) {
761: row = rbuf2_i[ct1];
762: if (!PetscBTLookupSet(table_i, row)) {
763: #if defined(PETSC_USE_CTABLE)
764: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
765: #else
766: data_i[isz_i] = row;
767: #endif
768: isz_i++;
769: }
770: }
771: isz[is_no] = isz_i;
772: }
773: }
775: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
776: }
778: #if defined(PETSC_USE_CTABLE)
779: tcount_max = 0;
780: for (PetscInt i = 0; i < imax; ++i) {
781: table_data_i = table_data[i];
782: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
783: if (tcount_max < tcount) tcount_max = tcount;
784: }
785: PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
786: #endif
788: for (PetscInt i = 0; i < imax; ++i) {
789: #if defined(PETSC_USE_CTABLE)
790: PetscHashIter tpos;
791: PetscInt j;
793: table_data_i = table_data[i];
794: PetscHashIterBegin(table_data_i, tpos);
795: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
796: PetscHashIterGetKey(table_data_i, tpos, k);
797: PetscHashIterGetVal(table_data_i, tpos, j);
798: PetscHashIterNext(table_data_i, tpos);
799: tdata[--j] = --k;
800: }
801: PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
802: #else
803: PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
804: #endif
805: PetscCall(PetscCommDestroy(&iscomms[i]));
806: }
808: PetscCall(PetscFree(iscomms));
809: PetscCall(PetscFree(onodes2));
810: PetscCall(PetscFree(olengths2));
812: PetscCall(PetscFree(pa));
813: PetscCall(PetscFree(rbuf2[0]));
814: PetscCall(PetscFree(rbuf2));
815: PetscCall(PetscFree(s_waits1));
816: PetscCall(PetscFree(r_waits1));
817: PetscCall(PetscFree(s_waits2));
818: PetscCall(PetscFree(r_waits2));
819: PetscCall(PetscFree(recv_status));
820: if (xdata) {
821: PetscCall(PetscFree(xdata[0]));
822: PetscCall(PetscFree(xdata));
823: }
824: PetscCall(PetscFree(isz1));
825: #if defined(PETSC_USE_CTABLE)
826: for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
827: PetscCall(PetscFree(table_data));
828: PetscCall(PetscFree(tdata));
829: PetscCall(PetscFree4(table, data, isz, t_p));
830: #else
831: PetscCall(PetscFree5(table, data, isz, d_p, t_p));
832: #endif
833: PetscFunctionReturn(PETSC_SUCCESS);
834: }
836: /*
837:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once(), to do
838:    the work on the local processor.
840:     Inputs:
841:      C      - MAT_MPIAIJ;
842:      imax   - total number of index sets processed at a time;
843:      table  - an array of char, of size m bits.
845:     Output:
846:      isz    - array containing the count of the solution elements corresponding
847:               to each index set;
848:      data or table_data - pointers to the solutions
849: */
850: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
851: {
852: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
853: Mat A = c->A, B = c->B;
854: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
855: PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
856: PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
857: PetscBT table_i;
858: #if defined(PETSC_USE_CTABLE)
859: PetscHMapI table_data_i;
860: PetscHashIter tpos;
861: PetscInt tcount, *tdata;
862: #else
863: PetscInt *data_i;
864: #endif
866: PetscFunctionBegin;
867: rstart = C->rmap->rstart;
868: cstart = C->cmap->rstart;
869: ai = a->i;
870: aj = a->j;
871: bi = b->i;
872: bj = b->j;
873: garray = c->garray;
875: for (i = 0; i < imax; i++) {
876: #if defined(PETSC_USE_CTABLE)
877: /* copy existing entries of table_data_i into tdata[] */
878: table_data_i = table_data[i];
879: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
880: PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
882: PetscCall(PetscMalloc1(tcount, &tdata));
883: PetscHashIterBegin(table_data_i, tpos);
884: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
885: PetscHashIterGetKey(table_data_i, tpos, row);
886: PetscHashIterGetVal(table_data_i, tpos, j);
887: PetscHashIterNext(table_data_i, tpos);
888: tdata[--j] = --row;
889: PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
890: }
891: #else
892: data_i = data[i];
893: #endif
894: table_i = table[i];
895: isz_i = isz[i];
896: max = isz[i];
898: for (j = 0; j < max; j++) {
899: #if defined(PETSC_USE_CTABLE)
900: row = tdata[j] - rstart;
901: #else
902: row = data_i[j] - rstart;
903: #endif
904: start = ai[row];
905: end = ai[row + 1];
906: for (k = start; k < end; k++) { /* Amat */
907: val = aj[k] + cstart;
908: if (!PetscBTLookupSet(table_i, val)) {
909: #if defined(PETSC_USE_CTABLE)
910: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
911: #else
912: data_i[isz_i] = val;
913: #endif
914: isz_i++;
915: }
916: }
917: start = bi[row];
918: end = bi[row + 1];
919: for (k = start; k < end; k++) { /* Bmat */
920: val = garray[bj[k]];
921: if (!PetscBTLookupSet(table_i, val)) {
922: #if defined(PETSC_USE_CTABLE)
923: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
924: #else
925: data_i[isz_i] = val;
926: #endif
927: isz_i++;
928: }
929: }
930: }
931: isz[i] = isz_i;
933: #if defined(PETSC_USE_CTABLE)
934: PetscCall(PetscFree(tdata));
935: #endif
936: }
937: PetscFunctionReturn(PETSC_SUCCESS);
938: }
940: /*
941:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages
942:          and return the output.
944:          Input:
945:            C     - the matrix
946:            nrqr  - number of messages being processed
947:            rbuf  - an array of pointers to the received requests
949:          Output:
950:            xdata - array of messages to be sent back
951:            isz1  - size of each message
953:   For better efficiency perhaps we should malloc each xdata[i] separately; then, if a
954: realloc is required, we would only need to copy the data for that one message rather
955: than for all previous ones, as happens now where a single large chunk of
956:  memory is used.
958: */
959: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
960: {
961: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
962: Mat A = c->A, B = c->B;
963: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
964: PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
965: PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
966: PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
967: PetscInt *rbuf_i, kmax, rbuf_0;
968: PetscBT xtable;
970: PetscFunctionBegin;
971: m = C->rmap->N;
972: rstart = C->rmap->rstart;
973: cstart = C->cmap->rstart;
974: ai = a->i;
975: aj = a->j;
976: bi = b->i;
977: bj = b->j;
978: garray = c->garray;
980: for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
981: rbuf_i = rbuf[i];
982: rbuf_0 = rbuf_i[0];
983: ct += rbuf_0;
984: for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
985: }
987: if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
988: else max1 = 1;
989: mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
990: if (nrqr) {
991: PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
992: ++no_malloc;
993: }
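  /* Note on the allocation strategy: xdata[] is carved out of one contiguous chunk sized by the
     estimate above; whenever the estimate is exceeded the whole chunk is regrown by a factor of
     1.5 and the pointers xdata[0..i] are re-derived from isz1[], which is why a single overflow
     copies the data of all previously processed messages. */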
994: PetscCall(PetscBTCreate(m, &xtable));
995: PetscCall(PetscArrayzero(isz1, nrqr));
997: ct3 = 0;
998:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
999: rbuf_i = rbuf[i];
1000: rbuf_0 = rbuf_i[0];
1001: ct1 = 2 * rbuf_0 + 1;
1002: ct2 = ct1;
1003: ct3 += ct1;
1004: for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1005: PetscCall(PetscBTMemzero(m, xtable));
1006: oct2 = ct2;
1007: kmax = rbuf_i[2 * j];
1008: for (k = 0; k < kmax; k++, ct1++) {
1009: row = rbuf_i[ct1];
1010: if (!PetscBTLookupSet(xtable, row)) {
1011: if (!(ct3 < mem_estimate)) {
1012: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1013: PetscCall(PetscMalloc1(new_estimate, &tmp));
1014: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1015: PetscCall(PetscFree(xdata[0]));
1016: xdata[0] = tmp;
1017: mem_estimate = new_estimate;
1018: ++no_malloc;
1019: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1020: }
1021: xdata[i][ct2++] = row;
1022: ct3++;
1023: }
1024: }
1025: for (k = oct2, max2 = ct2; k < max2; k++) {
1026: row = xdata[i][k] - rstart;
1027: start = ai[row];
1028: end = ai[row + 1];
1029: for (l = start; l < end; l++) {
1030: val = aj[l] + cstart;
1031: if (!PetscBTLookupSet(xtable, val)) {
1032: if (!(ct3 < mem_estimate)) {
1033: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1034: PetscCall(PetscMalloc1(new_estimate, &tmp));
1035: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1036: PetscCall(PetscFree(xdata[0]));
1037: xdata[0] = tmp;
1038: mem_estimate = new_estimate;
1039: ++no_malloc;
1040: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1041: }
1042: xdata[i][ct2++] = val;
1043: ct3++;
1044: }
1045: }
1046: start = bi[row];
1047: end = bi[row + 1];
1048: for (l = start; l < end; l++) {
1049: val = garray[bj[l]];
1050: if (!PetscBTLookupSet(xtable, val)) {
1051: if (!(ct3 < mem_estimate)) {
1052: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1053: PetscCall(PetscMalloc1(new_estimate, &tmp));
1054: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1055: PetscCall(PetscFree(xdata[0]));
1056: xdata[0] = tmp;
1057: mem_estimate = new_estimate;
1058: ++no_malloc;
1059: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1060: }
1061: xdata[i][ct2++] = val;
1062: ct3++;
1063: }
1064: }
1065: }
1066: /* Update the header*/
1067: xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1068: xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1069: }
1070: xdata[i][0] = rbuf_0;
1071: if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1072: isz1[i] = ct2; /* size of each message */
1073: }
1074: PetscCall(PetscBTDestroy(&xtable));
1075: PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1076: PetscFunctionReturn(PETSC_SUCCESS);
1077: }
1079: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1080: /*
1081: Every processor gets the entire matrix
1082: */
1083: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1084: {
1085: Mat B;
1086: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1087: Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1088: PetscMPIInt size, rank;
1089: PetscInt sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1090: PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1092: PetscFunctionBegin;
1093: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1094: PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1095: if (scall == MAT_INITIAL_MATRIX) {
1096: /* Tell every processor the number of nonzeros per row */
1097: PetscCall(PetscCalloc1(A->rmap->N, &lens));
1098: for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1100: /* All MPI processes get the same matrix */
1101: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1103: /* Create the sequential matrix of the same type as the local block diagonal */
1104: PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1105: PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1106: PetscCall(MatSetBlockSizesFromMats(B, A, A));
1107: PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1108: PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1109: PetscCall(PetscCalloc1(2, Bin));
1110: **Bin = B;
1111: b = (Mat_SeqAIJ *)B->data;
1113: /* zero column space */
1114: nrecv = 0;
1115: for (PetscMPIInt i = 0; i < size; i++) {
1116: for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1117: }
1118: PetscCall(PetscArrayzero(b->j, nrecv));
1120: /* Copy my part of matrix column indices over */
1121: sendcount = ad->nz + bd->nz;
1122: jsendbuf = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1123: a_jsendbuf = ad->j;
1124: b_jsendbuf = bd->j;
1125: n = A->rmap->rend - A->rmap->rstart;
1126: cnt = 0;
1127: for (PetscInt i = 0; i < n; i++) {
1128: /* put in lower diagonal portion */
1129: m = bd->i[i + 1] - bd->i[i];
1130: while (m > 0) {
1131: /* is it above diagonal (in bd (compressed) numbering) */
1132: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1133: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134: m--;
1135: }
1137: /* put in diagonal portion */
1138: for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1140: /* put in upper diagonal portion */
1141: while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1142: }
1143: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1145: /* send column indices, b->j was zeroed */
1146: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
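    /* The summation is really a gather: b->j was zeroed everywhere and each global row is owned
       by exactly one rank, so every entry receives a contribution from a single process and the
       MPI_SUM reproduces that rank's column indices on all processes; the same trick is used for
       the numerical values b->a further below. */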
1148:     /* Assemble the matrix into usable form (numerical values not yet set) */
1149: /* set the b->ilen (length of each row) values */
1150: PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1151: /* set the b->i indices */
1152: b->i[0] = 0;
1153: for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1154: PetscCall(PetscFree(lens));
1155: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1156: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1158: } else {
1159: B = **Bin;
1160: b = (Mat_SeqAIJ *)B->data;
1161: }
1163: /* Copy my part of matrix numerical values into the values location */
1164: if (flag == MAT_GET_VALUES) {
1165: const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166: MatScalar *sendbuf;
1168: /* initialize b->a */
1169: PetscCall(PetscArrayzero(b->a, b->nz));
1171: PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172: PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173: sendcount = ad->nz + bd->nz;
1174: sendbuf = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1175: a_sendbuf = ada;
1176: b_sendbuf = bda;
1177: b_sendj = bd->j;
1178: n = A->rmap->rend - A->rmap->rstart;
1179: cnt = 0;
1180: for (PetscInt i = 0; i < n; i++) {
1181: /* put in lower diagonal portion */
1182: m = bd->i[i + 1] - bd->i[i];
1183: while (m > 0) {
1184: /* is it above diagonal (in bd (compressed) numbering) */
1185: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186: sendbuf[cnt++] = *b_sendbuf++;
1187: m--;
1188: b_sendj++;
1189: }
1191: /* put in diagonal portion */
1192: for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1194: /* put in upper diagonal portion */
1195: while (m-- > 0) {
1196: sendbuf[cnt++] = *b_sendbuf++;
1197: b_sendj++;
1198: }
1199: }
1200: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1202: /* send values, b->a was zeroed */
1203: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1205: PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1206: PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1207:   } /* end of (flag == MAT_GET_VALUES) */
1209: PetscCall(MatPropagateSymmetryOptions(A, B));
1210: PetscFunctionReturn(PETSC_SUCCESS);
1211: }
1213: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1214: {
1215: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
1216: Mat submat, A = c->A, B = c->B;
1217: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1218: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1219: PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1220: const PetscInt *icol, *irow;
1221: PetscInt nrow, ncol, start;
1222: PetscMPIInt rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
1223: PetscInt **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
1224: PetscInt msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
1225: PetscInt **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
1226: PetscInt *lens, rmax, ncols, *cols, Crow;
1227: #if defined(PETSC_USE_CTABLE)
1228: PetscHMapI cmap, rmap;
1229: PetscInt *cmap_loc, *rmap_loc;
1230: #else
1231: PetscInt *cmap, *rmap;
1232: #endif
1233: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
1234: PetscInt *cworkB, lwrite, *subcols, ib, jb;
1235: PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1236: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1237: MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
1238: MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1239: MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
1240: MPI_Comm comm;
1241: PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1242: PetscMPIInt *onodes1, *olengths1, idex, end, *row2proc;
1243: Mat_SubSppt *smatis1;
1244: PetscBool isrowsorted, iscolsorted;
1246: PetscFunctionBegin;
1249: PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1250: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1251: PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1252: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1253: size = c->size;
1254: rank = c->rank;
1256: PetscCall(ISSorted(iscol[0], &iscolsorted));
1257: PetscCall(ISSorted(isrow[0], &isrowsorted));
1258: PetscCall(ISGetIndices(isrow[0], &irow));
1259: PetscCall(ISGetLocalSize(isrow[0], &nrow));
1260: if (allcolumns) {
1261: icol = NULL;
1262: ncol = C->cmap->N;
1263: } else {
1264: PetscCall(ISGetIndices(iscol[0], &icol));
1265: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1266: }
1268: if (scall == MAT_INITIAL_MATRIX) {
1269: PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1271: /* Get some new tags to keep the communication clean */
1272: tag1 = ((PetscObject)C)->tag;
1273: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1274: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1276:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1277:        required. Based on this, buffers are allocated, and data copied into them */
1278: PetscCall(PetscCalloc2(size, &w1, size, &w2));
1279: PetscCall(PetscMalloc1(nrow, &row2proc));
1281: /* w1[proc] = num of rows owned by proc -- to be requested */
1282: proc = 0;
1283: nrqs = 0; /* num of outgoing messages */
1284: for (PetscInt j = 0; j < nrow; j++) {
1285: row = irow[j];
1286: if (!isrowsorted) proc = 0;
1287: while (row >= C->rmap->range[proc + 1]) proc++;
1288: w1[proc]++;
1289: row2proc[j] = proc; /* map row index to proc */
1291: if (proc != rank && !w2[proc]) {
1292: w2[proc] = 1;
1293: nrqs++;
1294: }
1295: }
1296: w1[rank] = 0; /* rows owned by self will not be requested */
1298: PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1299: for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) {
1300: if (w1[proc]) pa[j++] = proc;
1301: }
1303:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1304: msz = 0; /* total mesg length (for all procs) */
1305: for (PetscMPIInt i = 0; i < nrqs; i++) {
1306: w1[pa[i]] += 3;
1307: msz += w1[pa[i]];
1308: }
1309: PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
1311:     /* Determine nrqr, the number of messages to expect, their lengths, and the sending ranks */
1312: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1313: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1315: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1316: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1317: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1319: /* Now post the Irecvs corresponding to these messages */
1320: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1322: PetscCall(PetscFree(onodes1));
1323: PetscCall(PetscFree(olengths1));
1325: /* Allocate Memory for outgoing messages */
1326: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1327: PetscCall(PetscArrayzero(sbuf1, size));
1328: PetscCall(PetscArrayzero(ptr, size));
1330:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1331: iptr = tmp;
1332: for (PetscMPIInt i = 0; i < nrqs; i++) {
1333: sbuf1[pa[i]] = iptr;
1334: iptr += w1[pa[i]];
1335: }
1337: /* Form the outgoing messages */
1338: /* Initialize the header space */
1339: for (PetscMPIInt i = 0; i < nrqs; i++) {
1340: PetscCall(PetscArrayzero(sbuf1[pa[i]], 3));
1341: ptr[pa[i]] = sbuf1[pa[i]] + 3;
1342: }
1344: /* Parse the isrow and copy data into outbuf */
1345: PetscCall(PetscArrayzero(ctr, size));
1346: for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */
1347: if (row2proc[j] != rank) { /* copy to the outgoing buf */
1348: *ptr[row2proc[j]] = irow[j];
1349: ctr[row2proc[j]]++;
1350: ptr[row2proc[j]]++;
1351: }
1352: }
1354: /* Update the headers for the current IS */
1355: for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
1356: if ((ctr_j = ctr[j])) {
1357: sbuf1_j = sbuf1[j];
1358: k = ++sbuf1_j[0];
1359: sbuf1_j[2 * k] = ctr_j;
1360: sbuf1_j[2 * k - 1] = 0;
1361: }
1362: }
1364: /* Now post the sends */
1365: PetscCall(PetscMalloc1(nrqs, &s_waits1));
1366: for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Isend(sbuf1[pa[i]], w1[pa[i]], MPIU_INT, pa[i], tag1, comm, s_waits1 + i));
1368: /* Post Receives to capture the buffer size */
1369: PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1370: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1372: if (nrqs) rbuf2[0] = tmp + msz;
1373: for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1375: for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[pa[i]], MPIU_INT, pa[i], tag2, comm, r_waits2 + i));
1377: PetscCall(PetscFree2(w1, w2));
1379: /* Send to other procs the buf size they should allocate */
1380: /* Receive messages*/
1381: PetscCall(PetscMalloc1(nrqr, &r_status1));
1382: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1384: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1385: for (PetscMPIInt i = 0; i < nrqr; ++i) {
1386: req_size[i] = 0;
1387: rbuf1_i = rbuf1[i];
1388: start = 2 * rbuf1_i[0] + 1;
1389: PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1390: PetscCall(PetscMalloc1(end, &sbuf2[i]));
1391: sbuf2_i = sbuf2[i];
1392: for (PetscInt j = start; j < end; j++) {
1393: k = rbuf1_i[j] - rstart;
1394: ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1395: sbuf2_i[j] = ncols;
1396: req_size[i] += ncols;
1397: }
1398: req_source1[i] = r_status1[i].MPI_SOURCE;
1400: /* form the header */
1401: sbuf2_i[0] = req_size[i];
1402: for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1404: PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1405: }
1407: PetscCall(PetscFree(r_status1));
1408: PetscCall(PetscFree(r_waits1));
1410: /* rbuf2 is received, Post recv column indices a->j */
1411: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1413: PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1414: for (PetscMPIInt i = 0; i < nrqs; ++i) {
1415: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1416: req_source2[i] = r_status2[i].MPI_SOURCE;
1417: PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1418: }
1420: /* Wait on sends1 and sends2 */
1421: PetscCall(PetscMalloc1(nrqs, &s_status1));
1422: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1423: PetscCall(PetscFree(s_waits1));
1424: PetscCall(PetscFree(s_status1));
1426: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1427: PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1429: /* Now allocate sending buffers for a->j, and send them off */
1430: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1431: jcnt = 0;
1432: for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1433: if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
1434: for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1436: for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */
1437: rbuf1_i = rbuf1[i];
1438: sbuf_aj_i = sbuf_aj[i];
1439: ct1 = 2 * rbuf1_i[0] + 1;
1440: ct2 = 0;
1441: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1443: kmax = rbuf1[i][2];
1444: for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */
1445: row = rbuf1_i[ct1] - rstart;
1446: nzA = ai[row + 1] - ai[row];
1447: nzB = bi[row + 1] - bi[row];
1448: ncols = nzA + nzB;
1449: cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1450: cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1452: /* load the column indices for this row into cols*/
1453: cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1455: lwrite = 0;
1456: for (PetscInt l = 0; l < nzB; l++) {
1457: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1458: }
1459: for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1460: for (PetscInt l = 0; l < nzB; l++) {
1461: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1462: }
1464: ct2 += ncols;
1465: }
1466: PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1467: }
1469: /* create column map (cmap): global col of C -> local col of submat */
1470: #if defined(PETSC_USE_CTABLE)
1471: if (!allcolumns) {
1472: PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1473: PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1474: for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1475: if (icol[j] >= cstart && icol[j] < cend) {
1476: cmap_loc[icol[j] - cstart] = j + 1;
1477: } else { /* use PetscHMapI for non-local col indices */
1478: PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1479: }
1480: }
1481: } else {
1482: cmap = NULL;
1483: cmap_loc = NULL;
1484: }
1485: PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1486: #else
1487: if (!allcolumns) {
1488: PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1489: for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1490: } else {
1491: cmap = NULL;
1492: }
1493: #endif
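/* Note the 1-based convention: cmap (and cmap_loc) store local column + 1, so a lookup that
   returns 0 means "column not selected by iscol". With PETSC_USE_CTABLE, columns in the local
   diagonal block use the dense array cmap_loc[], and only off-process columns go through the
   hash map. */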
1495: /* Create lens for MatSeqAIJSetPreallocation() */
1496: PetscCall(PetscCalloc1(nrow, &lens));
1498: /* Compute lens from local part of C */
1499: for (PetscInt j = 0; j < nrow; j++) {
1500: row = irow[j];
1501: if (row2proc[j] == rank) {
1502: /* diagonal part A = c->A */
1503: ncols = ai[row - rstart + 1] - ai[row - rstart];
1504: cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1505: if (!allcolumns) {
1506: for (PetscInt k = 0; k < ncols; k++) {
1507: #if defined(PETSC_USE_CTABLE)
1508: tcol = cmap_loc[cols[k]];
1509: #else
1510: tcol = cmap[cols[k] + cstart];
1511: #endif
1512: if (tcol) lens[j]++;
1513: }
1514: } else { /* allcolumns */
1515: lens[j] = ncols;
1516: }
1518: /* off-diagonal part B = c->B */
1519: ncols = bi[row - rstart + 1] - bi[row - rstart];
1520: cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1521: if (!allcolumns) {
1522: for (PetscInt k = 0; k < ncols; k++) {
1523: #if defined(PETSC_USE_CTABLE)
1524: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1525: #else
1526: tcol = cmap[bmap[cols[k]]];
1527: #endif
1528: if (tcol) lens[j]++;
1529: }
1530: } else { /* allcolumns */
1531: lens[j] += ncols;
1532: }
1533: }
1534: }
1536: /* Create row map (rmap): global row of C -> local row of submat */
1537: #if defined(PETSC_USE_CTABLE)
1538: PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1539: for (PetscInt j = 0; j < nrow; j++) {
1540: row = irow[j];
1541: if (row2proc[j] == rank) { /* a local row */
1542: rmap_loc[row - rstart] = j;
1543: } else {
1544: PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1545: }
1546: }
1547: #else
1548: PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1549: for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j;
1550: #endif
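/* rmap maps a global row of C to its local row in submat. The plain array (and rmap_loc[]) store
   0-based row indices, while the hash-map entries are offset by one (key+1 -> value+1) so that a
   zero lookup result means "row not found". */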
1552: /* Update lens from offproc data */
1553: /* recv a->j is done */
1554: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1555: for (PetscMPIInt i = 0; i < nrqs; i++) {
1556: sbuf1_i = sbuf1[pa[i]];
1557: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1558: ct1 = 2 + 1;
1559: ct2 = 0;
1560: rbuf2_i = rbuf2[i]; /* received length of C->j */
1561: rbuf3_i = rbuf3[i]; /* received C->j */
1563: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1564: max1 = sbuf1_i[2];
1565: for (PetscInt k = 0; k < max1; k++, ct1++) {
1566: #if defined(PETSC_USE_CTABLE)
1567: PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1568: row--;
1569: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1570: #else
1571: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1572: #endif
1573: /* Now, store row index of submat in sbuf1_i[ct1] */
1574: sbuf1_i[ct1] = row;
1576: nnz = rbuf2_i[ct1];
1577: if (!allcolumns) {
1578: for (PetscMPIInt l = 0; l < nnz; l++, ct2++) {
1579: #if defined(PETSC_USE_CTABLE)
1580: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1581: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1582: } else {
1583: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1584: }
1585: #else
1586: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1587: #endif
1588: if (tcol) lens[row]++;
1589: }
1590: } else { /* allcolumns */
1591: lens[row] += nnz;
1592: }
1593: }
1594: }
1595: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1596: PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1598: /* Create the submatrices */
1599: PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1600: PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1602: PetscCall(ISGetBlockSize(isrow[0], &ib));
1603: PetscCall(ISGetBlockSize(iscol[0], &jb));
1604: if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb));
1605: PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1606: PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1608: /* create struct Mat_SubSppt and attach it to submat */
1609: PetscCall(PetscNew(&smatis1));
1610: subc = (Mat_SeqAIJ *)submat->data;
1611: subc->submatis1 = smatis1;
1613: smatis1->id = 0;
1614: smatis1->nrqs = nrqs;
1615: smatis1->nrqr = nrqr;
1616: smatis1->rbuf1 = rbuf1;
1617: smatis1->rbuf2 = rbuf2;
1618: smatis1->rbuf3 = rbuf3;
1619: smatis1->sbuf2 = sbuf2;
1620: smatis1->req_source2 = req_source2;
1622: smatis1->sbuf1 = sbuf1;
1623: smatis1->ptr = ptr;
1624: smatis1->tmp = tmp;
1625: smatis1->ctr = ctr;
1627: smatis1->pa = pa;
1628: smatis1->req_size = req_size;
1629: smatis1->req_source1 = req_source1;
1631: smatis1->allcolumns = allcolumns;
1632: smatis1->singleis = PETSC_TRUE;
1633: smatis1->row2proc = row2proc;
1634: smatis1->rmap = rmap;
1635: smatis1->cmap = cmap;
1636: #if defined(PETSC_USE_CTABLE)
1637: smatis1->rmap_loc = rmap_loc;
1638: smatis1->cmap_loc = cmap_loc;
1639: #endif
1641: smatis1->destroy = submat->ops->destroy;
1642: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1643: submat->factortype = C->factortype;
1645: /* compute rmax */
1646: rmax = 0;
1647: for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1649: } else { /* scall == MAT_REUSE_MATRIX */
1650: submat = submats[0];
1651: PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1653: subc = (Mat_SeqAIJ *)submat->data;
1654: rmax = subc->rmax;
1655: smatis1 = subc->submatis1;
1656: nrqs = smatis1->nrqs;
1657: nrqr = smatis1->nrqr;
1658: rbuf1 = smatis1->rbuf1;
1659: rbuf2 = smatis1->rbuf2;
1660: rbuf3 = smatis1->rbuf3;
1661: req_source2 = smatis1->req_source2;
1663: sbuf1 = smatis1->sbuf1;
1664: sbuf2 = smatis1->sbuf2;
1665: ptr = smatis1->ptr;
1666: tmp = smatis1->tmp;
1667: ctr = smatis1->ctr;
1669: pa = smatis1->pa;
1670: req_size = smatis1->req_size;
1671: req_source1 = smatis1->req_source1;
1673: allcolumns = smatis1->allcolumns;
1674: row2proc = smatis1->row2proc;
1675: rmap = smatis1->rmap;
1676: cmap = smatis1->cmap;
1677: #if defined(PETSC_USE_CTABLE)
1678: rmap_loc = smatis1->rmap_loc;
1679: cmap_loc = smatis1->cmap_loc;
1680: #endif
1681: }
1683: /* Post recv matrix values */
1684: PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1685: PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1686: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1687: for (PetscMPIInt i = 0; i < nrqs; ++i) {
1688: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1689: PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1690: }
1692: /* Allocate sending buffers for a->a, and send them off */
1693: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1694: jcnt = 0;
1695: for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1696: if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
1697: for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1699: for (PetscMPIInt i = 0; i < nrqr; i++) {
1700: rbuf1_i = rbuf1[i];
1701: sbuf_aa_i = sbuf_aa[i];
1702: ct1 = 2 * rbuf1_i[0] + 1;
1703: ct2 = 0;
1704: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1706: kmax = rbuf1_i[2];
1707: for (PetscInt k = 0; k < kmax; k++, ct1++) {
1708: row = rbuf1_i[ct1] - rstart;
1709: nzA = ai[row + 1] - ai[row];
1710: nzB = bi[row + 1] - bi[row];
1711: ncols = nzA + nzB;
1712: cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1713: vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1714: vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1716: /* load the column values for this row into vals*/
1717: vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1719: lwrite = 0;
1720: for (PetscInt l = 0; l < nzB; l++) {
1721: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1722: }
1723: for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1724: for (PetscInt l = 0; l < nzB; l++) {
1725: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1726: }
1728: ct2 += ncols;
1729: }
1730: PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1731: }
1733: /* Assemble submat */
1734: /* First assemble the local rows */
1735: for (PetscInt j = 0; j < nrow; j++) {
1736: row = irow[j];
1737: if (row2proc[j] == rank) {
1738: Crow = row - rstart; /* local row index of C */
1739: #if defined(PETSC_USE_CTABLE)
1740: row = rmap_loc[Crow]; /* row index of submat */
1741: #else
1742: row = rmap[row];
1743: #endif
1745: if (allcolumns) {
1746: PetscInt ncol = 0;
1748: /* diagonal part A = c->A */
1749: ncols = ai[Crow + 1] - ai[Crow];
1750: cols = PetscSafePointerPlusOffset(aj, ai[Crow]);
1751: vals = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1752: for (PetscInt k = 0; k < ncols; k++) {
1753: subcols[ncol] = cols[k] + cstart;
1754: subvals[ncol++] = vals[k];
1755: }
1757: /* off-diagonal part B = c->B */
1758: ncols = bi[Crow + 1] - bi[Crow];
1759: cols = PetscSafePointerPlusOffset(bj, bi[Crow]);
1760: vals = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1761: for (PetscInt k = 0; k < ncols; k++) {
1762: subcols[ncol] = bmap[cols[k]];
1763: subvals[ncol++] = vals[k];
1764: }
1766: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1768: } else { /* !allcolumns */
1769: PetscInt ncol = 0;
1771: #if defined(PETSC_USE_CTABLE)
1772: /* diagonal part A = c->A */
1773: ncols = ai[Crow + 1] - ai[Crow];
1774: cols = PetscSafePointerPlusOffset(aj, ai[Crow]);
1775: vals = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1776: for (PetscInt k = 0; k < ncols; k++) {
1777: tcol = cmap_loc[cols[k]];
1778: if (tcol) {
1779: subcols[ncol] = --tcol;
1780: subvals[ncol++] = vals[k];
1781: }
1782: }
1784: /* off-diagonal part B = c->B */
1785: ncols = bi[Crow + 1] - bi[Crow];
1786: cols = PetscSafePointerPlusOffset(bj, bi[Crow]);
1787: vals = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1788: for (PetscInt k = 0; k < ncols; k++) {
1789: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1790: if (tcol) {
1791: subcols[ncol] = --tcol;
1792: subvals[ncol++] = vals[k];
1793: }
1794: }
1795: #else
1796: /* diagonal part A = c->A */
1797: ncols = ai[Crow + 1] - ai[Crow];
1798: cols = aj + ai[Crow];
1799: vals = a_a + ai[Crow];
1800: for (PetscInt k = 0; k < ncols; k++) {
1801: tcol = cmap[cols[k] + cstart];
1802: if (tcol) {
1803: subcols[ncol] = --tcol;
1804: subvals[ncol++] = vals[k];
1805: }
1806: }
1808: /* off-diagonal part B = c->B */
1809: ncols = bi[Crow + 1] - bi[Crow];
1810: cols = bj + bi[Crow];
1811: vals = b_a + bi[Crow];
1812: for (PetscInt k = 0; k < ncols; k++) {
1813: tcol = cmap[bmap[cols[k]]];
1814: if (tcol) {
1815: subcols[ncol] = --tcol;
1816: subvals[ncol++] = vals[k];
1817: }
1818: }
1819: #endif
1820: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1821: }
1822: }
1823: }
1825: /* Now assemble the off-proc rows */
1826: for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */
1827: /* recv values from other processes */
1828: PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1829: sbuf1_i = sbuf1[pa[idex]];
1830: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1831: ct1 = 2 + 1;
1832: ct2 = 0; /* count of received C->j */
1833: ct3 = 0; /* count of received C->j that will be inserted into submat */
1834: rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1835: rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1836: rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1838: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1839: max1 = sbuf1_i[2]; /* num of rows */
1840: for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */
1841: row = sbuf1_i[ct1]; /* row index of submat */
1842: if (!allcolumns) {
1843: idex = 0;
1844: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1845: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1846: for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1847: #if defined(PETSC_USE_CTABLE)
1848: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1849: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1850: } else {
1851: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1852: }
1853: #else
1854: tcol = cmap[rbuf3_i[ct2]];
1855: #endif
1856: if (tcol) {
1857: subcols[idex] = --tcol; /* may not be sorted */
1858: subvals[idex++] = rbuf4_i[ct2];
1860: /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1861: For reuse, we replace the received C->j with the index that should be inserted into submat */
1862: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1863: }
1864: }
1865: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1866: } else { /* scall == MAT_REUSE_MATRIX */
1867: submat = submats[0];
1868: subc = (Mat_SeqAIJ *)submat->data;
1870: nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1871: for (PetscInt l = 0; l < nnz; l++) {
1872: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1873: subvals[idex++] = rbuf4_i[ct2];
1874: }
1876: bj = subc->j + subc->i[row]; /* sorted column indices */
1877: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1878: }
1879: } else { /* allcolumns */
1880: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1881: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1882: ct2 += nnz;
1883: }
1884: }
1885: }
1887: /* sends of a->a are done */
1888: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1889: PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1891: PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1892: PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1893: submats[0] = submat;
1895: /* Restore the indices */
1896: PetscCall(ISRestoreIndices(isrow[0], &irow));
1897: if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1899: /* Destroy allocated memory */
1900: for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1901: PetscCall(PetscFree3(rbuf4, subcols, subvals));
1902: if (sbuf_aa) {
1903: PetscCall(PetscFree(sbuf_aa[0]));
1904: PetscCall(PetscFree(sbuf_aa));
1905: }
1907: if (scall == MAT_INITIAL_MATRIX) {
1908: PetscCall(PetscFree(lens));
1909: if (sbuf_aj) {
1910: PetscCall(PetscFree(sbuf_aj[0]));
1911: PetscCall(PetscFree(sbuf_aj));
1912: }
1913: }
1914: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1915: PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1916: PetscFunctionReturn(PETSC_SUCCESS);
1917: }
1919: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1920: {
1921: PetscInt ncol;
1922: PetscBool colflag, allcolumns = PETSC_FALSE;
1924: PetscFunctionBegin;
1925: /* Allocate memory to hold all the submatrices */
1926: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1928: /* Check for special case: this processor requests all matrix columns */
1929: PetscCall(ISIdentity(iscol[0], &colflag));
1930: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1931: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1933: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1934: PetscFunctionReturn(PETSC_SUCCESS);
1935: }
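/*
   A minimal usage sketch (illustrative only, not taken from this file): each rank requests one
   sequential submatrix of an MPIAIJ matrix C through the public MatCreateSubMatrices() interface,
   which dispatches to the routines below. The names nloc and first are placeholders for the
   caller's local index range.

     IS   rowis, colis;
     Mat *submats;
     PetscCall(ISCreateStride(PETSC_COMM_SELF, nloc, first, 1, &rowis));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, nloc, first, 1, &colis));
     PetscCall(MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submats));
     ...   use submats[0], a MATSEQAIJ matrix
     PetscCall(MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_REUSE_MATRIX, &submats));
     ...   refill after C's numerical values change
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&rowis));
     PetscCall(ISDestroy(&colis));
*/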
1937: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1938: {
1939: PetscInt nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
1940: PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE;
1941: Mat_SeqAIJ *subc;
1942: Mat_SubSppt *smat;
1944: PetscFunctionBegin;
1945: /* Check for special case: each processor has a single IS */
1946: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1947: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1948: C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1949: PetscFunctionReturn(PETSC_SUCCESS);
1950: }
1952: /* Determine nmax, the maximum number of submatrices extracted per stage */
1953: if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1954: else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1955: if (!nmax) nmax = 1;
1957: if (scall == MAT_INITIAL_MATRIX) {
1958: /* Collect global wantallmatrix and nstages */
1959: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1960: PetscCall(ISIdentity(*isrow, &rowflag));
1961: PetscCall(ISIdentity(*iscol, &colflag));
1962: PetscCall(ISGetLocalSize(*isrow, &nrow));
1963: PetscCall(ISGetLocalSize(*iscol, &ncol));
1964: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1965: wantallmatrix = PETSC_TRUE;
1967: PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1968: }
1969: }
1971: /* Determine the number of stages through which the submatrices are extracted.
1972: Each stage extracts at most nmax submatrices.
1973: nmax is determined by the matrix column dimension.
1974: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1975: */
1976: nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1978: in[0] = -1 * (PetscInt)wantallmatrix;
1979: in[1] = nstages;
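/* One MPI_MAX reduction does double duty: the maximum of -wantallmatrix implements a logical AND
   across ranks (the fast whole-matrix path is taken only if every rank wants it), while the
   maximum of nstages gives the global stage count that every rank must loop over. */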
1980: PetscCallMPI(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1981: wantallmatrix = (PetscBool)(-out[0]);
1982: nstages = out[1]; /* Make sure every processor loops through the global nstages */
1984: } else { /* MAT_REUSE_MATRIX */
1985: if (ismax) {
1986: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1987: smat = subc->submatis1;
1988: } else { /* (*submat)[0] is a dummy matrix */
1989: smat = (Mat_SubSppt *)(*submat)[0]->data;
1990: }
1991: if (!smat) {
1992: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1993: wantallmatrix = PETSC_TRUE;
1994: } else if (smat->singleis) {
1995: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1996: PetscFunctionReturn(PETSC_SUCCESS);
1997: } else {
1998: nstages = smat->nstages;
1999: }
2000: }
2002: if (wantallmatrix) {
2003: PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2004: PetscFunctionReturn(PETSC_SUCCESS);
2005: }
2007: /* Allocate memory to hold all the submatrices and dummy submatrices */
2008: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2010: for (PetscInt i = 0, pos = 0; i < nstages; i++) {
2011: if (pos + nmax <= ismax) max_no = nmax;
2012: else if (pos >= ismax) max_no = 0;
2013: else max_no = ismax - pos;
2015: PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2016: if (!max_no) {
2017: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2018: smat = (Mat_SubSppt *)(*submat)[pos]->data;
2019: smat->nstages = nstages;
2020: }
2021: pos++; /* advance to next dummy matrix if any */
2022: } else pos += max_no;
2023: }
2025: if (ismax && scall == MAT_INITIAL_MATRIX) {
2026: /* save nstages for reuse */
2027: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2028: smat = subc->submatis1;
2029: smat->nstages = nstages;
2030: }
2031: PetscFunctionReturn(PETSC_SUCCESS);
2032: }
2034: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2035: {
2036: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
2037: Mat A = c->A;
2038: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2039: const PetscInt **icol, **irow;
2040: PetscInt *nrow, *ncol, start;
2041: PetscMPIInt nrqs = 0, *pa, proc = -1;
2042: PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
2043: PetscInt **sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row;
2044: PetscInt msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol;
2045: PetscInt **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2046: PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2047: #if defined(PETSC_USE_CTABLE)
2048: PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2049: #else
2050: PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2051: #endif
2052: const PetscInt *irow_i;
2053: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2054: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2055: MPI_Request *r_waits4, *s_waits3, *s_waits4;
2056: MPI_Comm comm;
2057: PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2058: PetscMPIInt *onodes1, *olengths1, end, **row2proc, *row2proc_i;
2059: PetscInt ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2060: Mat_SubSppt *smat_i;
2061: PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2062: PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt;
2064: PetscFunctionBegin;
2065: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2066: size = c->size;
2067: rank = c->rank;
2069: PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2070: PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2072: for (PetscInt i = 0; i < ismax; i++) {
2073: PetscCall(ISSorted(iscol[i], &issorted[i]));
2074: if (!issorted[i]) iscsorted = issorted[i];
2076: PetscCall(ISSorted(isrow[i], &issorted[i]));
2078: PetscCall(ISGetIndices(isrow[i], &irow[i]));
2079: PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2081: /* Check for special case: allcolumn */
2082: PetscCall(ISIdentity(iscol[i], &colflag));
2083: PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2084: if (colflag && ncol[i] == C->cmap->N) {
2085: allcolumns[i] = PETSC_TRUE;
2086: icol[i] = NULL;
2087: } else {
2088: allcolumns[i] = PETSC_FALSE;
2089: PetscCall(ISGetIndices(iscol[i], &icol[i]));
2090: }
2091: }
2093: if (scall == MAT_REUSE_MATRIX) {
2094: /* Assumes the new rows are the same length as the old rows */
2095: for (PetscInt i = 0; i < ismax; i++) {
2096: PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2097: subc = (Mat_SeqAIJ *)submats[i]->data;
2098: PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2100: /* Initialize the matrix as if it were empty */
2101: PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2103: smat_i = subc->submatis1;
2105: nrqs = smat_i->nrqs;
2106: nrqr = smat_i->nrqr;
2107: rbuf1 = smat_i->rbuf1;
2108: rbuf2 = smat_i->rbuf2;
2109: rbuf3 = smat_i->rbuf3;
2110: req_source2 = smat_i->req_source2;
2112: sbuf1 = smat_i->sbuf1;
2113: sbuf2 = smat_i->sbuf2;
2114: ptr = smat_i->ptr;
2115: tmp = smat_i->tmp;
2116: ctr = smat_i->ctr;
2118: pa = smat_i->pa;
2119: req_size = smat_i->req_size;
2120: req_source1 = smat_i->req_source1;
2122: allcolumns[i] = smat_i->allcolumns;
2123: row2proc[i] = smat_i->row2proc;
2124: rmap[i] = smat_i->rmap;
2125: cmap[i] = smat_i->cmap;
2126: }
2128: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2129: PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2130: smat_i = (Mat_SubSppt *)submats[0]->data;
2132: nrqs = smat_i->nrqs;
2133: nrqr = smat_i->nrqr;
2134: rbuf1 = smat_i->rbuf1;
2135: rbuf2 = smat_i->rbuf2;
2136: rbuf3 = smat_i->rbuf3;
2137: req_source2 = smat_i->req_source2;
2139: sbuf1 = smat_i->sbuf1;
2140: sbuf2 = smat_i->sbuf2;
2141: ptr = smat_i->ptr;
2142: tmp = smat_i->tmp;
2143: ctr = smat_i->ctr;
2145: pa = smat_i->pa;
2146: req_size = smat_i->req_size;
2147: req_source1 = smat_i->req_source1;
2149: allcolumns[0] = PETSC_FALSE;
2150: }
2151: } else { /* scall == MAT_INITIAL_MATRIX */
2152: /* Get some new tags to keep the communication clean */
2153: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2154: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2156: /* Evaluate communication: to whom messages go, their lengths, and the buffer space
2157: required. Based on this, buffers are allocated and data is copied into them */
2158: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
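/* w4[proc] counts the rows of the current IS owned by proc (scratch, reset per IS); these are
   accumulated into w1[proc], the total number of row indices requested from proc, and w3[proc]
   counts how many ISs request at least one row from proc. w2[proc] is set below to 1 if any
   message is sent to proc at all. */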
2160: for (PetscInt i = 0; i < ismax; i++) {
2161: jmax = nrow[i];
2162: irow_i = irow[i];
2164: PetscCall(PetscMalloc1(jmax, &row2proc_i));
2165: row2proc[i] = row2proc_i;
2167: if (issorted[i]) proc = 0;
2168: for (PetscInt j = 0; j < jmax; j++) {
2169: if (!issorted[i]) proc = 0;
2170: row = irow_i[j];
2171: while (row >= C->rmap->range[proc + 1]) proc++;
2172: w4[proc]++;
2173: row2proc_i[j] = proc; /* map row index to proc */
2174: }
2175: for (PetscMPIInt j = 0; j < size; j++) {
2176: if (w4[j]) {
2177: w1[j] += w4[j];
2178: w3[j]++;
2179: w4[j] = 0;
2180: }
2181: }
2182: }
2184: nrqs = 0; /* no of outgoing messages */
2185: msz = 0; /* total mesg length (for all procs) */
2186: w1[rank] = 0; /* no mesg sent to self */
2187: w3[rank] = 0;
2188: for (PetscMPIInt i = 0; i < size; i++) {
2189: if (w1[i]) {
2190: w2[i] = 1;
2191: nrqs++;
2192: } /* there exists a message to proc i */
2193: }
2194: PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
2195: for (PetscMPIInt i = 0, j = 0; i < size; i++) {
2196: if (w1[i]) {
2197: pa[j] = i;
2198: j++;
2199: }
2200: }
2202: /* Each message would have a header = 1 + 2*(no of IS) + data */
2203: for (PetscMPIInt i = 0; i < nrqs; i++) {
2204: PetscMPIInt j = pa[i];
2205: w1[j] += w2[j] + 2 * w3[j];
2206: msz += w1[j];
2207: }
2208: PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));
2210: /* Determine the number of messages to expect, their lengths, and their from-ids */
2211: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2212: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2214: /* Now post the Irecvs corresponding to these messages */
2215: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2216: PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2218: /* Allocate Memory for outgoing messages */
2219: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2220: PetscCall(PetscArrayzero(sbuf1, size));
2221: PetscCall(PetscArrayzero(ptr, size));
2223: {
2224: PetscInt *iptr = tmp;
2225: k = 0;
2226: for (PetscMPIInt i = 0; i < nrqs; i++) {
2227: PetscMPIInt j = pa[i];
2228: iptr += k;
2229: sbuf1[j] = iptr;
2230: k = w1[j];
2231: }
2232: }
2234: /* Form the outgoing messages. Initialize the header space */
2235: for (PetscMPIInt i = 0; i < nrqs; i++) {
2236: PetscMPIInt j = pa[i];
2237: sbuf1[j][0] = 0;
2238: PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2239: ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2240: }
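/* Layout of each outgoing request sbuf1[j]: sbuf1[j][0] = number of ISs that request rows owned
   by proc j, followed by (IS index, row count) pairs for those ISs, followed by the requested
   global row indices themselves, appended through ptr[j]. */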
2242: /* Parse the isrow and copy data into outbuf */
2243: for (PetscInt i = 0; i < ismax; i++) {
2244: row2proc_i = row2proc[i];
2245: PetscCall(PetscArrayzero(ctr, size));
2246: irow_i = irow[i];
2247: jmax = nrow[i];
2248: for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */
2249: proc = row2proc_i[j];
2250: if (proc != rank) { /* copy to the outgoing buf */
2251: ctr[proc]++;
2252: *ptr[proc] = irow_i[j];
2253: ptr[proc]++;
2254: }
2255: }
2256: /* Update the headers for the current IS */
2257: for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */
2258: if ((ctr_j = ctr[j])) {
2259: sbuf1_j = sbuf1[j];
2260: k = ++sbuf1_j[0];
2261: sbuf1_j[2 * k] = ctr_j;
2262: sbuf1_j[2 * k - 1] = i;
2263: }
2264: }
2265: }
2267: /* Now post the sends */
2268: PetscCall(PetscMalloc1(nrqs, &s_waits1));
2269: for (PetscMPIInt i = 0; i < nrqs; ++i) {
2270: PetscMPIInt j = pa[i];
2271: PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2272: }
2274: /* Post Receives to capture the buffer size */
2275: PetscCall(PetscMalloc1(nrqs, &r_waits2));
2276: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2277: if (nrqs) rbuf2[0] = tmp + msz;
2278: for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
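/* The rbuf2[] reply buffers set up above alias the upper half of tmp (which was allocated with
   length 2*msz), so no separate allocation is needed for the reply headers. */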
2279: for (PetscMPIInt i = 0; i < nrqs; ++i) {
2280: PetscMPIInt j = pa[i];
2281: PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2282: }
2284: /* Send to other procs the buf size they should allocate */
2285: /* Receive messages*/
2286: PetscCall(PetscMalloc1(nrqr, &s_waits2));
2287: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2288: {
2289: PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2290: PetscInt *sbuf2_i;
2292: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2293: for (PetscMPIInt i = 0; i < nrqr; ++i) {
2294: req_size[i] = 0;
2295: rbuf1_i = rbuf1[i];
2296: start = 2 * rbuf1_i[0] + 1;
2297: end = olengths1[i];
2298: PetscCall(PetscMalloc1(end, &sbuf2[i]));
2299: sbuf2_i = sbuf2[i];
2300: for (PetscInt j = start; j < end; j++) {
2301: id = rbuf1_i[j] - rstart;
2302: ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2303: sbuf2_i[j] = ncols;
2304: req_size[i] += ncols;
2305: }
2306: req_source1[i] = onodes1[i];
2307: /* form the header */
2308: sbuf2_i[0] = req_size[i];
2309: for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2311: PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2312: }
2313: }
2315: PetscCall(PetscFree(onodes1));
2316: PetscCall(PetscFree(olengths1));
2317: PetscCall(PetscFree(r_waits1));
2318: PetscCall(PetscFree4(w1, w2, w3, w4));
2320: /* Receive messages*/
2321: PetscCall(PetscMalloc1(nrqs, &r_waits3));
2322: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2323: for (PetscMPIInt i = 0; i < nrqs; ++i) {
2324: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2325: req_source2[i] = pa[i];
2326: PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2327: }
2328: PetscCall(PetscFree(r_waits2));
2330: /* Wait on sends1 and sends2 */
2331: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2332: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2333: PetscCall(PetscFree(s_waits1));
2334: PetscCall(PetscFree(s_waits2));
2336: /* Now allocate sending buffers for a->j, and send them off */
2337: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2338: jcnt = 0;
2339: for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2340: if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
2341: for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2343: PetscCall(PetscMalloc1(nrqr, &s_waits3));
2344: {
2345: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2346: PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2347: PetscInt cend = C->cmap->rend;
2348: PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2350: for (PetscMPIInt i = 0; i < nrqr; i++) {
2351: rbuf1_i = rbuf1[i];
2352: sbuf_aj_i = sbuf_aj[i];
2353: ct1 = 2 * rbuf1_i[0] + 1;
2354: ct2 = 0;
2355: for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2356: kmax = rbuf1[i][2 * j];
2357: for (PetscInt k = 0; k < kmax; k++, ct1++) {
2358: row = rbuf1_i[ct1] - rstart;
2359: nzA = a_i[row + 1] - a_i[row];
2360: nzB = b_i[row + 1] - b_i[row];
2361: ncols = nzA + nzB;
2362: cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2363: cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2365: /* load the column indices for this row into cols */
2366: cols = sbuf_aj_i + ct2;
2368: lwrite = 0;
2369: for (PetscInt l = 0; l < nzB; l++) {
2370: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2371: }
2372: for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2373: for (PetscInt l = 0; l < nzB; l++) {
2374: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2375: }
2377: ct2 += ncols;
2378: }
2379: }
2380: PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2381: }
2382: }
2384: /* create col map: global col of C -> local col of submatrices */
2385: {
2386: const PetscInt *icol_i;
2387: #if defined(PETSC_USE_CTABLE)
2388: for (PetscInt i = 0; i < ismax; i++) {
2389: if (!allcolumns[i]) {
2390: PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2392: jmax = ncol[i];
2393: icol_i = icol[i];
2394: cmap_i = cmap[i];
2395: for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2396: } else cmap[i] = NULL;
2397: }
2398: #else
2399: for (PetscInt i = 0; i < ismax; i++) {
2400: if (!allcolumns[i]) {
2401: PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2402: jmax = ncol[i];
2403: icol_i = icol[i];
2404: cmap_i = cmap[i];
2405: for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2406: } else cmap[i] = NULL;
2407: }
2408: #endif
2409: }
2411: /* Create lens, which is required for preallocating the submatrices (MatSeqAIJSetPreallocation()) */
2412: jcnt = 0;
2413: for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i];
2414: PetscCall(PetscMalloc1(ismax, &lens));
2416: if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0]));
2417: for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
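/* lens[0] is one contiguous zeroed block of length sum(nrow[i]); lens[i] point into it at per-IS
   offsets, so a single allocation serves all index sets */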
2419: /* Update lens from local data */
2420: for (PetscInt i = 0; i < ismax; i++) {
2421: row2proc_i = row2proc[i];
2422: jmax = nrow[i];
2423: if (!allcolumns[i]) cmap_i = cmap[i];
2424: irow_i = irow[i];
2425: lens_i = lens[i];
2426: for (PetscInt j = 0; j < jmax; j++) {
2427: row = irow_i[j];
2428: proc = row2proc_i[j];
2429: if (proc == rank) {
2430: PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2431: if (!allcolumns[i]) {
2432: for (PetscInt k = 0; k < ncols; k++) {
2433: #if defined(PETSC_USE_CTABLE)
2434: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2435: #else
2436: tcol = cmap_i[cols[k]];
2437: #endif
2438: if (tcol) lens_i[j]++;
2439: }
2440: } else { /* allcolumns */
2441: lens_i[j] = ncols;
2442: }
2443: PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2444: }
2445: }
2446: }
2448: /* Create row map: global row of C -> local row of submatrices */
2449: #if defined(PETSC_USE_CTABLE)
2450: for (PetscInt i = 0; i < ismax; i++) {
2451: PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2452: irow_i = irow[i];
2453: jmax = nrow[i];
2454: for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2455: }
2456: #else
2457: for (PetscInt i = 0; i < ismax; i++) {
2458: PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2459: rmap_i = rmap[i];
2460: irow_i = irow[i];
2461: jmax = nrow[i];
2462: for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2463: }
2464: #endif
2466: /* Update lens from offproc data */
2467: {
2468: PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2470: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2471: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2472: sbuf1_i = sbuf1[pa[tmp2]];
2473: jmax = sbuf1_i[0];
2474: ct1 = 2 * jmax + 1;
2475: ct2 = 0;
2476: rbuf2_i = rbuf2[tmp2];
2477: rbuf3_i = rbuf3[tmp2];
2478: for (PetscInt j = 1; j <= jmax; j++) {
2479: is_no = sbuf1_i[2 * j - 1];
2480: max1 = sbuf1_i[2 * j];
2481: lens_i = lens[is_no];
2482: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2483: rmap_i = rmap[is_no];
2484: for (PetscInt k = 0; k < max1; k++, ct1++) {
2485: #if defined(PETSC_USE_CTABLE)
2486: PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2487: row--;
2488: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2489: #else
2490: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new (sub)matrix */
2491: #endif
2492: max2 = rbuf2_i[ct1];
2493: for (PetscInt l = 0; l < max2; l++, ct2++) {
2494: if (!allcolumns[is_no]) {
2495: #if defined(PETSC_USE_CTABLE)
2496: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2497: #else
2498: tcol = cmap_i[rbuf3_i[ct2]];
2499: #endif
2500: if (tcol) lens_i[row]++;
2501: } else { /* allcolumns */
2502: lens_i[row]++; /* lens_i[row] += max2 ? */
2503: }
2504: }
2505: }
2506: }
2507: }
2508: }
2509: PetscCall(PetscFree(r_waits3));
2510: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2511: PetscCall(PetscFree(s_waits3));
2513: /* Create the submatrices */
2514: for (PetscInt i = 0; i < ismax; i++) {
2515: PetscInt rbs, cbs;
2517: PetscCall(ISGetBlockSize(isrow[i], &rbs));
2518: PetscCall(ISGetBlockSize(iscol[i], &cbs));
2520: PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2521: PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2523: if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2524: PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2525: PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2527: /* create struct Mat_SubSppt and attach it to submat */
2528: PetscCall(PetscNew(&smat_i));
2529: subc = (Mat_SeqAIJ *)submats[i]->data;
2530: subc->submatis1 = smat_i;
2532: smat_i->destroy = submats[i]->ops->destroy;
2533: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2534: submats[i]->factortype = C->factortype;
2536: smat_i->id = i;
2537: smat_i->nrqs = nrqs;
2538: smat_i->nrqr = nrqr;
2539: smat_i->rbuf1 = rbuf1;
2540: smat_i->rbuf2 = rbuf2;
2541: smat_i->rbuf3 = rbuf3;
2542: smat_i->sbuf2 = sbuf2;
2543: smat_i->req_source2 = req_source2;
2545: smat_i->sbuf1 = sbuf1;
2546: smat_i->ptr = ptr;
2547: smat_i->tmp = tmp;
2548: smat_i->ctr = ctr;
2550: smat_i->pa = pa;
2551: smat_i->req_size = req_size;
2552: smat_i->req_source1 = req_source1;
2554: smat_i->allcolumns = allcolumns[i];
2555: smat_i->singleis = PETSC_FALSE;
2556: smat_i->row2proc = row2proc[i];
2557: smat_i->rmap = rmap[i];
2558: smat_i->cmap = cmap[i];
2559: }
2561: if (!ismax) { /* Create dummy submats[0] to carry struct Mat_SubSppt for reuse */
2562: PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2563: PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2564: PetscCall(MatSetType(submats[0], MATDUMMY));
2566: /* create struct Mat_SubSppt and attach it to submats[0] */
2567: PetscCall(PetscNew(&smat_i));
2568: submats[0]->data = (void *)smat_i;
2570: smat_i->destroy = submats[0]->ops->destroy;
2571: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2572: submats[0]->factortype = C->factortype;
2574: smat_i->id = 0;
2575: smat_i->nrqs = nrqs;
2576: smat_i->nrqr = nrqr;
2577: smat_i->rbuf1 = rbuf1;
2578: smat_i->rbuf2 = rbuf2;
2579: smat_i->rbuf3 = rbuf3;
2580: smat_i->sbuf2 = sbuf2;
2581: smat_i->req_source2 = req_source2;
2583: smat_i->sbuf1 = sbuf1;
2584: smat_i->ptr = ptr;
2585: smat_i->tmp = tmp;
2586: smat_i->ctr = ctr;
2588: smat_i->pa = pa;
2589: smat_i->req_size = req_size;
2590: smat_i->req_source1 = req_source1;
2592: smat_i->allcolumns = PETSC_FALSE;
2593: smat_i->singleis = PETSC_FALSE;
2594: smat_i->row2proc = NULL;
2595: smat_i->rmap = NULL;
2596: smat_i->cmap = NULL;
2597: }
2599: if (ismax) PetscCall(PetscFree(lens[0]));
2600: PetscCall(PetscFree(lens));
2601: if (sbuf_aj) {
2602: PetscCall(PetscFree(sbuf_aj[0]));
2603: PetscCall(PetscFree(sbuf_aj));
2604: }
2606: } /* end of scall == MAT_INITIAL_MATRIX */
2608: /* Post recv matrix values */
2609: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2610: PetscCall(PetscMalloc1(nrqs, &rbuf4));
2611: PetscCall(PetscMalloc1(nrqs, &r_waits4));
2612: for (PetscMPIInt i = 0; i < nrqs; ++i) {
2613: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2614: PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2615: }
2617: /* Allocate sending buffers for a->a, and send them off */
2618: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2619: jcnt = 0;
2620: for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2621: if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
2622: for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2624: PetscCall(PetscMalloc1(nrqr, &s_waits4));
2625: {
2626: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2627: PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2628: PetscInt cend = C->cmap->rend;
2629: PetscInt *b_j = b->j;
2630: PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2632: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2633: PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2634: for (PetscMPIInt i = 0; i < nrqr; i++) {
2635: rbuf1_i = rbuf1[i];
2636: sbuf_aa_i = sbuf_aa[i];
2637: ct1 = 2 * rbuf1_i[0] + 1;
2638: ct2 = 0;
2639: for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2640: kmax = rbuf1_i[2 * j];
2641: for (PetscInt k = 0; k < kmax; k++, ct1++) {
2642: row = rbuf1_i[ct1] - rstart;
2643: nzA = a_i[row + 1] - a_i[row];
2644: nzB = b_i[row + 1] - b_i[row];
2645: ncols = nzA + nzB;
2646: cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2647: vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2648: vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2650: /* load the column values for this row into vals*/
2651: vals = sbuf_aa_i + ct2;
2653: lwrite = 0;
2654: for (PetscInt l = 0; l < nzB; l++) {
2655: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2656: }
2657: for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2658: for (PetscInt l = 0; l < nzB; l++) {
2659: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2660: }
2662: ct2 += ncols;
2663: }
2664: }
2665: PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2666: }
2667: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2668: PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2669: }
2671: /* Assemble the matrices */
2672: /* First assemble the local rows */
2673: for (PetscInt i = 0; i < ismax; i++) {
2674: row2proc_i = row2proc[i];
2675: subc = (Mat_SeqAIJ *)submats[i]->data;
2676: imat_ilen = subc->ilen;
2677: imat_j = subc->j;
2678: imat_i = subc->i;
2679: imat_a = subc->a;
2681: if (!allcolumns[i]) cmap_i = cmap[i];
2682: rmap_i = rmap[i];
2683: irow_i = irow[i];
2684: jmax = nrow[i];
2685: for (PetscInt j = 0; j < jmax; j++) {
2686: row = irow_i[j];
2687: proc = row2proc_i[j];
2688: if (proc == rank) {
2689: old_row = row;
2690: #if defined(PETSC_USE_CTABLE)
2691: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2692: row--;
2693: #else
2694: row = rmap_i[row];
2695: #endif
2696: ilen_row = imat_ilen[row];
2697: PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2698: mat_i = imat_i[row];
2699: mat_a = imat_a + mat_i;
2700: mat_j = imat_j + mat_i;
2701: if (!allcolumns[i]) {
2702: for (PetscInt k = 0; k < ncols; k++) {
2703: #if defined(PETSC_USE_CTABLE)
2704: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2705: #else
2706: tcol = cmap_i[cols[k]];
2707: #endif
2708: if (tcol) {
2709: *mat_j++ = tcol - 1;
2710: *mat_a++ = vals[k];
2711: ilen_row++;
2712: }
2713: }
2714: } else { /* allcolumns */
2715: for (PetscInt k = 0; k < ncols; k++) {
2716: *mat_j++ = cols[k]; /* global col index! */
2717: *mat_a++ = vals[k];
2718: ilen_row++;
2719: }
2720: }
2721: PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2723: imat_ilen[row] = ilen_row;
2724: }
2725: }
2726: }
2728: /* Now assemble the off-proc rows */
2729: PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2730: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2731: sbuf1_i = sbuf1[pa[tmp2]];
2732: jmax = sbuf1_i[0];
2733: ct1 = 2 * jmax + 1;
2734: ct2 = 0;
2735: rbuf2_i = rbuf2[tmp2];
2736: rbuf3_i = rbuf3[tmp2];
2737: rbuf4_i = rbuf4[tmp2];
2738: for (PetscInt j = 1; j <= jmax; j++) {
2739: is_no = sbuf1_i[2 * j - 1];
2740: rmap_i = rmap[is_no];
2741: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2742: subc = (Mat_SeqAIJ *)submats[is_no]->data;
2743: imat_ilen = subc->ilen;
2744: imat_j = subc->j;
2745: imat_i = subc->i;
2746: imat_a = subc->a;
2747: max1 = sbuf1_i[2 * j];
2748: for (PetscInt k = 0; k < max1; k++, ct1++) {
2749: row = sbuf1_i[ct1];
2750: #if defined(PETSC_USE_CTABLE)
2751: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2752: row--;
2753: #else
2754: row = rmap_i[row];
2755: #endif
2756: ilen = imat_ilen[row];
2757: mat_i = imat_i[row];
2758: mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2759: mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2760: max2 = rbuf2_i[ct1];
2761: if (!allcolumns[is_no]) {
2762: for (PetscInt l = 0; l < max2; l++, ct2++) {
2763: #if defined(PETSC_USE_CTABLE)
2764: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2765: #else
2766: tcol = cmap_i[rbuf3_i[ct2]];
2767: #endif
2768: if (tcol) {
2769: *mat_j++ = tcol - 1;
2770: *mat_a++ = rbuf4_i[ct2];
2771: ilen++;
2772: }
2773: }
2774: } else { /* allcolumns */
2775: for (PetscInt l = 0; l < max2; l++, ct2++) {
2776: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2777: *mat_a++ = rbuf4_i[ct2];
2778: ilen++;
2779: }
2780: }
2781: imat_ilen[row] = ilen;
2782: }
2783: }
2784: }
2786: if (!iscsorted) { /* sort column indices of the rows */
2787: for (PetscInt i = 0; i < ismax; i++) {
2788: subc = (Mat_SeqAIJ *)submats[i]->data;
2789: imat_j = subc->j;
2790: imat_i = subc->i;
2791: imat_a = subc->a;
2792: imat_ilen = subc->ilen;
2794: if (allcolumns[i]) continue;
2795: jmax = nrow[i];
2796: for (PetscInt j = 0; j < jmax; j++) {
2797: mat_i = imat_i[j];
2798: mat_a = imat_a + mat_i;
2799: mat_j = imat_j + mat_i;
2800: PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2801: }
2802: }
2803: }
2805: PetscCall(PetscFree(r_waits4));
2806: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2807: PetscCall(PetscFree(s_waits4));
2809: /* Restore the indices */
2810: for (PetscInt i = 0; i < ismax; i++) {
2811: PetscCall(ISRestoreIndices(isrow[i], irow + i));
2812: if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2813: }
2815: for (PetscInt i = 0; i < ismax; i++) {
2816: PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2817: PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2818: }
2820: /* Destroy allocated memory */
2821: if (sbuf_aa) {
2822: PetscCall(PetscFree(sbuf_aa[0]));
2823: PetscCall(PetscFree(sbuf_aa));
2824: }
2825: PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2827: for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2828: PetscCall(PetscFree(rbuf4));
2830: PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2831: PetscFunctionReturn(PETSC_SUCCESS);
2832: }
2834: /*
2835: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2836: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2837: of [0,m), dcolemb is in [0,n), and ocolemb is in [0,N-n).
2838: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2839: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2840: state, and needs to be "assembled" later by compressing B's column space.
2842: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2843: Following this call, C->A & C->B have been created, even if empty.
2844: */
2845: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2846: {
2847: /* If making this function public, change the error returned in this function away from _PLIB. */
2848: Mat_MPIAIJ *aij;
2849: Mat_SeqAIJ *Baij;
2850: PetscBool seqaij, Bdisassembled;
2851: PetscInt m, n, *nz, ngcol, col, rstart, rend, shift, count;
2852: PetscScalar v;
2853: const PetscInt *rowindices, *colindices;
2855: PetscFunctionBegin;
2856: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2857: if (A) {
2858: PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2859: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2860: if (rowemb) {
2861: PetscCall(ISGetLocalSize(rowemb, &m));
2862: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2863: } else {
2864: PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2865: }
2866: if (dcolemb) {
2867: PetscCall(ISGetLocalSize(dcolemb, &n));
2868: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2869: } else {
2870: PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2871: }
2872: }
2873: if (B) {
2874: PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2875: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2876: if (rowemb) {
2877: PetscCall(ISGetLocalSize(rowemb, &m));
2878: PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2879: } else {
2880: PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2881: }
2882: if (ocolemb) {
2883: PetscCall(ISGetLocalSize(ocolemb, &n));
2884: PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2885: } else {
2886: PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2887: }
2888: }
2890: aij = (Mat_MPIAIJ *)C->data;
2891: if (!aij->A) {
2892: /* Mimic parts of MatMPIAIJSetPreallocation() */
2893: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2894: PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2895: PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2896: PetscCall(MatSetType(aij->A, MATSEQAIJ));
2897: }
2898: if (A) {
2899: PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2900: } else {
2901: PetscCall(MatSetUp(aij->A));
2902: }
2903: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2904: /*
2905: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2906: need to "disassemble" B -- convert it to using C's global indices.
2907: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2909: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2910: we MatZeroEntries(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2912: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2913: At least avoid calling MatSetValues() and the implied searches?
2914: */
2916: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2917: #if defined(PETSC_USE_CTABLE)
2918: PetscCall(PetscHMapIDestroy(&aij->colmap));
2919: #else
2920: PetscCall(PetscFree(aij->colmap));
2921: /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
2922: #endif
2923: ngcol = 0;
2924: if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2925: if (aij->garray) PetscCall(PetscFree(aij->garray));
2926: PetscCall(VecDestroy(&aij->lvec));
2927: PetscCall(VecScatterDestroy(&aij->Mvctx));
2928: }
2929: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2930: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2931: }
2932: Bdisassembled = PETSC_FALSE;
2933: if (!aij->B) {
2934: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2935: PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2936: PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2937: PetscCall(MatSetType(aij->B, MATSEQAIJ));
2938: Bdisassembled = PETSC_TRUE;
2939: }
2940: if (B) {
2941: Baij = (Mat_SeqAIJ *)B->data;
2942: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2943: PetscCall(PetscMalloc1(B->rmap->n, &nz));
2944: for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2945: PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2946: PetscCall(PetscFree(nz));
2947: }
2949: PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2950: shift = rend - rstart;
2951: count = 0;
2952: rowindices = NULL;
2953: colindices = NULL;
2954: if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2955: if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2956: for (PetscInt i = 0; i < B->rmap->n; i++) {
2957: PetscInt row;
2958: row = i;
2959: if (rowindices) row = rowindices[i];
2960: for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2961: col = Baij->j[count];
2962: if (colindices) col = colindices[col];
2963: if (Bdisassembled && col >= rstart) col += shift;
2964: v = Baij->a[count];
2965: PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2966: ++count;
2967: }
2968: }
2969: /* No assembly for aij->B is necessary. */
2970: /* FIXME: set aij->B's nonzerostate correctly. */
2971: } else {
2972: PetscCall(MatSetUp(aij->B));
2973: }
2974: C->preallocated = PETSC_TRUE;
2975: C->was_assembled = PETSC_FALSE;
2976: C->assembled = PETSC_FALSE;
2977: /*
2978: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2979: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2980: */
2981: PetscFunctionReturn(PETSC_SUCCESS);
2982: }
2984: /*
2985: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2986: */
2987: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2988: {
2989: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
2991: PetscFunctionBegin;
2992: PetscAssertPointer(A, 2);
2993: PetscAssertPointer(B, 3);
2994: /* FIXME: make sure C is assembled */
2995: *A = aij->A;
2996: *B = aij->B;
2997: /* Note that we don't incref *A and *B, so be careful! */
2998: PetscFunctionReturn(PETSC_SUCCESS);
2999: }
3001: /*
3002: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3003: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3004: */
3005: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3006: {
3007: PetscMPIInt size, flag;
3008: PetscInt cismax, ispar;
3009: Mat *A, *B;
3010: IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3012: PetscFunctionBegin;
3013: if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3015: cismax = 0;
3016: for (PetscInt i = 0; i < ismax; ++i) {
3017: PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3018: PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3019: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3020: if (size > 1) ++cismax;
3021: }
3023: /*
3024: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3025: ispar counts the number of parallel ISs across C's comm.
3026: */
3027: PetscCallMPI(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3028: if (!ispar) { /* Sequential ISs only across C's comm, so we can call the sequential matrix extraction subroutine. */
3029: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3030: PetscFunctionReturn(PETSC_SUCCESS);
3031: }
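/*
   Editorial note: because ispar is the MPI_MAX reduction of cismax over C's communicator, it has
   the same value on every rank, so all ranks take the same branch above and the early return is
   collectively consistent.
*/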
3033: /* if (ispar) */
3034: /*
3035: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3036: These are used to extract the off-diag portion of the resulting parallel matrix.
3037: The row IS for the off-diag portion is the same as for the diag portion,
3038: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3039: */
3040: PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3041: PetscCall(PetscMalloc1(cismax, &ciscol_p));
3042: for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3043: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3044: if (size > 1) {
3045: /*
3046: TODO: This is the part that's ***NOT SCALABLE***.
3047: To fix this we need to extract just the indices of C's nonzero columns
3048: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3049: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3050: to be done without serializing on the IS list, so, most likely, it is best
3051: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3052: */
3053: PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3054: /* Now we have to
3055: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3056: were sorted on each rank, concatenated they might no longer be sorted;
3057: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3058: indices in the nondecreasing order to the original index positions.
3059: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3060: */
3061: PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3062: PetscCall(ISSort(ciscol[ii]));
3063: ++ii;
3064: }
3065: }
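/*
   Editorial illustration of the ISSortPermutation() step above, under the semantics described in
   the preceding comment: for ciscol[ii] = {7, 2, 5} the permutation would be {1, 2, 0}, since the
   sorted order 2, 5, 7 is obtained from positions 1, 2, 0 of the original IS; if ciscol[ii] is
   already strictly increasing the permutation comes back as NULL, which the reuse checks further
   below rely on.
*/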
3066: PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3067: for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3068: PetscInt issize;
3069: const PetscInt *indices;
3071: /*
3072: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3073: */
3074: PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3075: PetscCall(ISSort(isrow[i]));
3076: PetscCall(ISGetLocalSize(isrow[i], &issize));
3077: PetscCall(ISGetIndices(isrow[i], &indices));
3078: for (PetscInt j = 1; j < issize; ++j) {
3079: PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3080: }
3081: PetscCall(ISRestoreIndices(isrow[i], &indices));
3082: PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3083: PetscCall(ISSort(iscol[i]));
3084: PetscCall(ISGetLocalSize(iscol[i], &issize));
3085: PetscCall(ISGetIndices(iscol[i], &indices));
3086: for (PetscInt j = 1; j < issize; ++j) {
3087: PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3088: }
3089: PetscCall(ISRestoreIndices(iscol[i], &indices));
3090: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3091: if (size > 1) {
3092: cisrow[ii] = isrow[i];
3093: ++ii;
3094: }
3095: }
3096: /*
3097: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3098: array of sequential matrices underlying the resulting parallel matrices.
3099: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3100: contain duplicates.
3102: There are as many diag matrices as there are original index sets. There are only as many parallel
3103: and off-diag matrices as there are parallel (comm size > 1) index sets.
3105: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3106: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3107: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3108: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3109: with A[i] and B[ii] extracted from the corresponding MPI submat.
3110: - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3111: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3112: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3113: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3114: values into A[i] and B[ii] sitting inside the corresponding submat.
3115: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3116: A[i], B[ii], AA[i] or BB[ii] matrices.
3117: */
3118: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3119: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3121: /* Now obtain the sequential A and B submatrices separately. */
3122: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3123: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3124: PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3126: /*
3127: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3128: matrices A & B have been extracted directly into the parallel matrices containing them, or
3129: simply into the sequential matrix identical with the corresponding A (if size == 1).
3130: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3131: to have the same sparsity pattern.
3132: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3133: must be constructed for C. This is done by setseqmat(s).
3134: */
3135: for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3136: /*
3137: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3138: That way we can avoid sorting and computing permutations when reusing.
3139: To this end:
3140: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3141: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3142: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3143: */
3144: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3146: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3147: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3148: if (size > 1) {
3149: if (scall == MAT_INITIAL_MATRIX) {
3150: PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3151: PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3152: PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3153: PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3154: PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3155: }
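/*
   Editorial note: PETSC_DETERMINE above lets PETSc derive the global row/column sizes of
   (*submat)[i] from the local sizes taken from the diagonal block A[i].
*/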
3156: /*
3157: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3158: */
3159: {
3160: Mat AA = A[i], BB = B[ii];
3162: if (AA || BB) {
3163: PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3164: PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3165: PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3166: }
3167: PetscCall(MatDestroy(&AA));
3168: }
3169: PetscCall(ISDestroy(ciscol + ii));
3170: PetscCall(ISDestroy(ciscol_p + ii));
3171: ++ii;
3172: } else { /* if (size == 1) */
3173: if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3174: if (isrow_p[i] || iscol_p[i]) {
3175: PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3176: PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3177: /* Otherwise A is extracted straight into (*submat)[i]. */
3178: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3179: PetscCall(MatDestroy(A + i));
3180: } else (*submat)[i] = A[i];
3181: }
3182: PetscCall(ISDestroy(&isrow_p[i]));
3183: PetscCall(ISDestroy(&iscol_p[i]));
3184: }
3185: PetscCall(PetscFree2(cisrow, ciscol));
3186: PetscCall(PetscFree2(isrow_p, iscol_p));
3187: PetscCall(PetscFree(ciscol_p));
3188: PetscCall(PetscFree(A));
3189: PetscCall(MatDestroySubMatrices(cismax, &B));
3190: PetscFunctionReturn(PETSC_SUCCESS);
3191: }
3193: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3194: {
3195: PetscFunctionBegin;
3196: PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3197: PetscFunctionReturn(PETSC_SUCCESS);
3198: }
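/*
   Editorial usage sketch (not part of this file): applications normally reach the routine above
   through the public MatCreateSubMatricesMPI() interface. The index sets "rowis" and "colis" here
   are hypothetical and may live on a subcommunicator of C.

     Mat *submat;

     PetscCall(MatCreateSubMatricesMPI(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submat));
     // ... use submat[0] ...
     PetscCall(MatDestroySubMatrices(1, &submat));
*/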