Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:    and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

 21: /*
 22:    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks

 24:    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but
 25:    that is a very major recoding job.

 27:    Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
 28: */
 29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
 30: {
 31:   PetscFunctionBegin;
 32:   for (PetscInt i = 0; i < imax; i++) {
 33:     if (!is[i]) break;
 34:     PetscInt        n = 0, N, Nmax, Nmin;
 35:     const PetscInt *idx;
 36:     PetscInt       *nidx = NULL;
 37:     MPI_Comm        comm;
 38:     PetscBT         bt;

 40:     PetscCall(ISGetLocalSize(is[i], &N));
 41:     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
 42:       PetscCall(ISGetIndices(is[i], &idx));
 43:       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
 44:       Nmin = Nmin / bs;
 45:       Nmax = Nmax / bs;
 46:       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
 47:       for (PetscInt j = 0; j < N; j++) {
 48:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
 49:       }
 50:       PetscCall(PetscMalloc1(n, &nidx));
 51:       n = 0;
 52:       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
 53:       for (PetscInt j = 0; j < N; j++) {
 54:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
 55:       }
 56:       PetscCall(PetscBTDestroy(&bt));
 57:       PetscCall(ISRestoreIndices(is[i], &idx));
 58:     }
 59:     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
 60:     PetscCall(ISDestroy(is + i));
 61:     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
 62:   }
 63:   PetscFunctionReturn(PETSC_SUCCESS);
 64: }
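
/*
   A worked instance of the adjustment above (a sketch; the indices are made up): with
   bs = 2, the point indices {3, 4, 7} touch the blocks {1, 2, 3}, so the IS is replaced
   by a block IS whose expansion is the point index set {2, 3, 4, 5, 6, 7}.

     IS             is;
     const PetscInt pts[] = {3, 4, 7};

     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, 3, pts, PETSC_COPY_VALUES, &is));
     PetscCall(ISAdjustForBlockSize(2, 1, &is));
     PetscCall(ISDestroy(&is));

   After the call, is is a block IS with bs = 2 containing the three block indices 1, 2, 3.
*/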

 66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
 67: {
 68:   PetscInt i;

 70:   PetscFunctionBegin;
 71:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 72:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
 73:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 74:   PetscFunctionReturn(PETSC_SUCCESS);
 75: }
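
/*
   A minimal caller-side sketch (assuming C is an assembled MATMPIAIJ matrix): the public
   MatIncreaseOverlap() dispatches here for MATMPIAIJ and grows each index set by ov levels
   of matrix connectivity, which is how PCASM builds overlapping subdomains.

     IS       is[1];
     PetscInt rstart, rend;

     PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &is[0]));
     PetscCall(MatIncreaseOverlap(C, 1, is, 2));
     PetscCall(ISDestroy(&is[0]));
*/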

 77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
 78: {
 79:   PetscInt i;

 81:   PetscFunctionBegin;
 82:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 83:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
 84:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 85:   PetscFunctionReturn(PETSC_SUCCESS);
 86: }

 88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
 89: {
 90:   MPI_Comm        comm;
 91:   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
 92:   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
 93:   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
 94:   const PetscInt *indices_i, **indices;
 95:   PetscLayout     rmap;
 96:   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
 97:   PetscSF         sf;
 98:   PetscSFNode    *remote;

100:   PetscFunctionBegin;
101:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103:   PetscCallMPI(MPI_Comm_size(comm, &size));
104:   /* get row map to determine where rows should be going */
105:   PetscCall(MatGetLayouts(mat, &rmap, NULL));
106:   /* retrieve IS data and put it all together so that we
107:    * can optimize communication
108:    */
109:   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110:   tlength = 0;
111:   for (PetscInt i = 0; i < nidx; i++) {
112:     PetscCall(ISGetLocalSize(is[i], &length[i]));
113:     tlength += length[i];
114:     PetscCall(ISGetIndices(is[i], &indices[i]));
115:   }
116:   /* find these rows on remote processors */
117:   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
118:   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
119:   nrrows = 0;
120:   for (PetscInt i = 0; i < nidx; i++) {
121:     length_i  = length[i];
122:     indices_i = indices[i];
123:     for (PetscInt j = 0; j < length_i; j++) {
124:       owner = -1;
125:       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
126:       /* remote processors */
127:       if (owner != rank) {
128:         tosizes_temp[owner]++;               /* number of rows to owner */
129:         rrow_ranks[nrrows]   = owner;        /* processor */
130:         rrow_isids[nrrows]   = i;            /* is id */
131:         remoterows[nrrows++] = indices_i[j]; /* row */
132:       }
133:     }
134:     PetscCall(ISRestoreIndices(is[i], &indices[i]));
135:   }
136:   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
137:   /* test if we need to exchange messages
138:    * generally speaking, we do not need to exchange
139:    * data when overlap is 1
140:    */
141:   PetscCallMPI(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
142:   /* we do not have any messages;
143:    * it usually corresponds to overlap 1
144:    */
145:   if (!reducednrrows) {
146:     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
147:     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
148:     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
149:     PetscFunctionReturn(PETSC_SUCCESS);
150:   }
151:   nto = 0;
152:   /* send sizes and ranks for building a two-sided communication */
153:   for (PetscMPIInt i = 0; i < size; i++) {
154:     if (tosizes_temp[i]) {
155:       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
156:       tosizes_temp[i]  = nto;                 /* a map from processor to index */
157:       toranks[nto++]   = i;                   /* MPI process */
158:     }
159:   }
160:   PetscCall(PetscMalloc1(nto + 1, &toffsets));
161:   toffsets[0] = 0;
162:   for (PetscInt i = 0; i < nto; i++) {
163:     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
164:     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
165:   }
166:   /* send information to other processors */
167:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
168:   nrecvrows = 0;
169:   for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
170:   PetscCall(PetscMalloc1(nrecvrows, &remote));
171:   nrecvrows = 0;
172:   for (PetscMPIInt i = 0; i < nfrom; i++) {
173:     for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
174:       remote[nrecvrows].rank    = fromranks[i];
175:       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
176:     }
177:   }
178:   PetscCall(PetscSFCreate(comm, &sf));
179:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
180:   /* use two-sided communication by default since Open MPI has bugs in its one-sided communication */
181:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
182:   PetscCall(PetscSFSetFromOptions(sf));
183:   /* message pair <no of is, row>  */
184:   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
185:   for (PetscInt i = 0; i < nrrows; i++) {
186:     owner                 = rrow_ranks[i];       /* process */
187:     j                     = tosizes_temp[owner]; /* index */
188:     todata[toffsets[j]++] = rrow_isids[i];
189:     todata[toffsets[j]++] = remoterows[i];
190:   }
191:   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
192:   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
193:   PetscCall(PetscFree(toffsets));
194:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
196:   PetscCall(PetscSFDestroy(&sf));
197:   /* send rows belonging to remote MPI processes so that we can get the overlapping data back */
198:   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
199:   PetscCall(PetscFree2(todata, fromdata));
200:   PetscCall(PetscFree(fromsizes));
201:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
202:   PetscCall(PetscFree(fromranks));
203:   nrecvrows = 0;
204:   for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
205:   PetscCall(PetscCalloc1(nrecvrows, &todata));
206:   PetscCall(PetscMalloc1(nrecvrows, &remote));
207:   nrecvrows = 0;
208:   for (PetscInt i = 0; i < nto; i++) {
209:     for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
210:       remote[nrecvrows].rank    = toranks[i];
211:       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
212:     }
213:   }
214:   PetscCall(PetscSFCreate(comm, &sf));
215:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
216:   /* use two-sided communication by default since Open MPI has bugs in its one-sided communication */
217:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
218:   PetscCall(PetscSFSetFromOptions(sf));
219:   /* overlap communication and computation */
220:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
221:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
222:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
223:   PetscCall(PetscSFDestroy(&sf));
224:   PetscCall(PetscFree2(sbdata, sbsizes));
225:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
226:   PetscCall(PetscFree(toranks));
227:   PetscCall(PetscFree(tosizes));
228:   PetscCall(PetscFree(todata));
229:   PetscFunctionReturn(PETSC_SUCCESS);
230: }
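
/*
   The exchange pattern used twice above, condensed (sendbuf and recvbuf are placeholder
   names; the other variables are as declared in the routine): PetscCommBuildTwoSided()
   turns the local list of destination ranks plus the <length, offset> pair announced to
   each of them into the matching list of source ranks and their announced pairs; a
   PetscSF whose leaves reference <source rank, offset into the source's packed buffer>
   then delivers the variable-length payload with a single broadcast.

     PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
     PetscCall(PetscSFCreate(comm, &sf));
     PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
     PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sendbuf, recvbuf, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sendbuf, recvbuf, MPI_REPLACE));
     PetscCall(PetscSFDestroy(&sf));
*/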

232: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
233: {
234:   PetscInt       *isz, isz_i, i, j, is_id, data_size;
235:   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
236:   const PetscInt *indices_i_temp;
237:   MPI_Comm       *iscomms;

239:   PetscFunctionBegin;
240:   max_lsize = 0;
241:   PetscCall(PetscMalloc1(nidx, &isz));
242:   for (i = 0; i < nidx; i++) {
243:     PetscCall(ISGetLocalSize(is[i], &lsize));
244:     max_lsize = lsize > max_lsize ? lsize : max_lsize;
245:     isz[i]    = lsize;
246:   }
247:   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
248:   for (i = 0; i < nidx; i++) {
249:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
250:     PetscCall(ISGetIndices(is[i], &indices_i_temp));
251:     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
252:     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
253:     PetscCall(ISDestroy(&is[i]));
254:   }
255:   /* retrieve information to get row id and its overlap */
256:   for (i = 0; i < nrecvs;) {
257:     is_id     = recvdata[i++];
258:     data_size = recvdata[i++];
259:     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
260:     isz_i     = isz[is_id];
261:     for (j = 0; j < data_size; j++) {
262:       col                = recvdata[i++];
263:       indices_i[isz_i++] = col;
264:     }
265:     isz[is_id] = isz_i;
266:   }
267:   /* remove duplicate entries */
268:   for (i = 0; i < nidx; i++) {
269:     indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
270:     isz_i     = isz[i];
271:     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
272:     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
273:     PetscCall(PetscCommDestroy(&iscomms[i]));
274:   }
275:   PetscCall(PetscFree(isz));
276:   PetscCall(PetscFree2(indices_temp, iscomms));
277:   PetscFunctionReturn(PETSC_SUCCESS);
278: }

280: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
281: {
282:   PetscLayout     rmap, cmap;
283:   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
284:   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
285:   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
286:   const PetscInt *gcols, *ai, *aj, *bi, *bj;
287:   Mat             amat, bmat;
288:   PetscMPIInt     rank;
289:   PetscBool       done;
290:   MPI_Comm        comm;

292:   PetscFunctionBegin;
293:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
294:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
295:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
296:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
297:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
298:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
299:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
300:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
301:   /* total number of nonzero values is used to estimate the memory usage in the next step */
302:   tnz = ai[an] + bi[bn];
303:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
304:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
305:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
306:   /* to find the longest message */
307:   max_fszs = 0;
308:   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
309:   /* is there a better way to estimate the number of nonzeros in the mat? */
310:   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
311:   for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
312:   rows_pos  = 0;
313:   totalrows = 0;
314:   for (i = 0; i < nfrom; i++) {
315:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
316:     /* group data together */
317:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
318:       is_id                       = fromrows[rows_pos++]; /* no of is */
319:       rows_i                      = rows_data[is_id];
320:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
321:     }
322:     /* estimate a space to avoid multiple allocations  */
323:     for (j = 0; j < nidx; j++) {
324:       indvc_ij = 0;
325:       rows_i   = rows_data[j];
326:       for (l = 0; l < rows_pos_i[j]; l++) {
327:         row   = rows_i[l] - rstart;
328:         start = ai[row];
329:         end   = ai[row + 1];
330:         for (k = start; k < end; k++) { /* Amat */
331:           col                     = aj[k] + cstart;
332:           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
333:         }
334:         start = bi[row];
335:         end   = bi[row + 1];
336:         for (k = start; k < end; k++) { /* Bmat */
337:           col                     = gcols[bj[k]];
338:           indices_tmp[indvc_ij++] = col;
339:         }
340:       }
341:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
342:       indv_counts[i * nidx + j] = indvc_ij;
343:       totalrows += indvc_ij;
344:     }
345:   }
346:   /* message triple <no of is, number of rows, rows> */
347:   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
348:   totalrows = 0;
349:   rows_pos  = 0;
350:   /* repeat the grouping pass above, this time packing the data into the send buffer */
351:   for (i = 0; i < nfrom; i++) {
352:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
353:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
354:       is_id                       = fromrows[rows_pos++];
355:       rows_i                      = rows_data[is_id];
356:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
357:     }
358:     /* add data  */
359:     for (j = 0; j < nidx; j++) {
360:       if (!indv_counts[i * nidx + j]) continue;
361:       indvc_ij            = 0;
362:       sbdata[totalrows++] = j;
363:       sbdata[totalrows++] = indv_counts[i * nidx + j];
364:       sbsizes[2 * i] += 2;
365:       rows_i = rows_data[j];
366:       for (l = 0; l < rows_pos_i[j]; l++) {
367:         row   = rows_i[l] - rstart;
368:         start = ai[row];
369:         end   = ai[row + 1];
370:         for (k = start; k < end; k++) { /* Amat */
371:           col                     = aj[k] + cstart;
372:           indices_tmp[indvc_ij++] = col;
373:         }
374:         start = bi[row];
375:         end   = bi[row + 1];
376:         for (k = start; k < end; k++) { /* Bmat */
377:           col                     = gcols[bj[k]];
378:           indices_tmp[indvc_ij++] = col;
379:         }
380:       }
381:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
382:       sbsizes[2 * i] += indvc_ij;
383:       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
384:       totalrows += indvc_ij;
385:     }
386:   }
387:   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
388:   offsets[0] = 0;
389:   for (i = 0; i < nfrom; i++) {
390:     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
391:     sbsizes[2 * i + 1] = offsets[i];
392:   }
393:   PetscCall(PetscFree(offsets));
394:   if (sbrowsizes) *sbrowsizes = sbsizes;
395:   if (sbrows) *sbrows = sbdata;
396:   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
397:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
398:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
399:   PetscFunctionReturn(PETSC_SUCCESS);
400: }
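
/*
   Reply layout produced above, as a worked instance (numbers are made up): if the overlap
   computed for one requesting rank consists of the rows {30, 31} for is[0] and the row {77}
   for is[2], the block written into sbdata for that rank is

     sbdata = [ 0, 2, 30, 31,   2, 1, 77 ]

   i.e. the triple <is id, number of rows, rows> repeated for each nonempty index set;
   sbsizes[2*i] = 7 is the block length and sbsizes[2*i+1] its offset within sbdata.
*/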

402: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
403: {
404:   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
405:   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
406:   PetscInt        lsize, lsize_tmp;
407:   PetscMPIInt     rank, owner;
408:   Mat             amat, bmat;
409:   PetscBool       done;
410:   PetscLayout     cmap, rmap;
411:   MPI_Comm        comm;

413:   PetscFunctionBegin;
414:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
415:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
416:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
417:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
418:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
419:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
420:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
421:   /* is this a safe way to compute the number of nonzero values? */
422:   tnz = ai[an] + bi[bn];
423:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
424:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
425:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
426:   /* this is a better memory estimate than the old implementation,
427:    * which used the global size of the matrix
428:    */
429:   PetscCall(PetscMalloc1(tnz, &indices_temp));
430:   for (i = 0; i < nidx; i++) {
431:     MPI_Comm iscomm;

433:     PetscCall(ISGetLocalSize(is[i], &lsize));
434:     PetscCall(ISGetIndices(is[i], &indices));
435:     lsize_tmp = 0;
436:     for (j = 0; j < lsize; j++) {
437:       owner = -1;
438:       row   = indices[j];
439:       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
440:       if (owner != rank) continue;
441:       /* local number */
442:       row -= rstart;
443:       start = ai[row];
444:       end   = ai[row + 1];
445:       for (k = start; k < end; k++) { /* Amat */
446:         col                       = aj[k] + cstart;
447:         indices_temp[lsize_tmp++] = col;
448:       }
449:       start = bi[row];
450:       end   = bi[row + 1];
451:       for (k = start; k < end; k++) { /* Bmat */
452:         col                       = gcols[bj[k]];
453:         indices_temp[lsize_tmp++] = col;
454:       }
455:     }
456:     PetscCall(ISRestoreIndices(is[i], &indices));
457:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
458:     PetscCall(ISDestroy(&is[i]));
459:     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
460:     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
461:     PetscCall(PetscCommDestroy(&iscomm));
462:   }
463:   PetscCall(PetscFree(indices_temp));
464:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
465:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
466:   PetscFunctionReturn(PETSC_SUCCESS);
467: }
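
/*
   The global column recovery used above (and again in the nonscalable routines below),
   isolated for one locally owned global row grow (a sketch; grow, lrow and gcol are
   placeholder names):

     PetscInt lrow = grow - rstart;
     for (k = ai[lrow]; k < ai[lrow + 1]; k++) gcol = aj[k] + cstart;      [diagonal block A]
     for (k = bi[lrow]; k < bi[lrow + 1]; k++) gcol = gcols[bj[k]];        [off-diagonal block B]

   The diagonal block stores local column indices, hence the cstart shift; the off-diagonal
   block stores compressed column indices that map to global columns through gcols/garray.
*/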

469: /*
470:   Sample message format:
471:   If a processor A wants processor B to process some elements corresponding
472:   to index sets is[1] and is[5], the message is laid out as
473:   mesg [0] = 2   (no of index sets in the mesg)
474:   -----------
475:   mesg [1] = 1 => is[1]
476:   mesg [2] = sizeof(is[1]);
477:   -----------
478:   mesg [3] = 5  => is[5]
479:   mesg [4] = sizeof(is[5]);
480:   -----------
481:   mesg [5]
482:   ... mesg [n]   = data for is[1]
483:   -----------
484:   mesg [n+1]
485:   ... mesg [m]   = data for is[5]
486:   -----------

488:   Notes:
489:   nrqs - no of requests sent (or to be sent out)
490:   nrqr - no of requests received (which have to be or which have been processed)
491: */
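
/*
  A concrete instance of the format above (made-up indices): if processor A asks processor B
  to process the rows {10, 11} for is[1] and the row {20} for is[5], the outgoing message is

    mesg = [ 2,   1, 2,   5, 1,   10, 11, 20 ]

  that is, 2 index sets, the header pairs (1, 2) and (5, 1), then the rows of is[1] followed
  by the rows of is[5].
*/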
492: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
493: {
494:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
495:   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
496:   const PetscInt **idx, *idx_i;
497:   PetscInt        *n, **data, len;
498: #if defined(PETSC_USE_CTABLE)
499:   PetscHMapI *table_data, table_data_i;
500:   PetscInt   *tdata, tcount, tcount_max;
501: #else
502:   PetscInt *data_i, *d_p;
503: #endif
504:   PetscMPIInt  size, rank, tag1, tag2, proc = 0, nrqs, *pa;
505:   PetscInt     M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
506:   PetscInt    *ctr, *tmp, *isz, **xdata, **rbuf2;
507:   PetscBT     *table;
508:   MPI_Comm     comm;
509:   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
510:   MPI_Status  *recv_status;
511:   MPI_Comm    *iscomms;
512:   char        *t_p;

514:   PetscFunctionBegin;
515:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
516:   size = c->size;
517:   rank = c->rank;
518:   M    = C->rmap->N;

520:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
521:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

523:   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

525:   for (PetscInt i = 0; i < imax; i++) {
526:     PetscCall(ISGetIndices(is[i], &idx[i]));
527:     PetscCall(ISGetLocalSize(is[i], &n[i]));
528:   }

530:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
531:      required. Based on this, buffers are allocated, and data copied into them  */
532:   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
533:   for (PetscInt i = 0; i < imax; i++) {
534:     PetscCall(PetscArrayzero(w4, size)); /* initialize work vector */
535:     idx_i = idx[i];
536:     len   = n[i];
537:     for (PetscInt j = 0; j < len; j++) {
538:       row = idx_i[j];
539:       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
540:       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
541:       w4[proc]++;
542:     }
543:     for (PetscMPIInt j = 0; j < size; j++) {
544:       if (w4[j]) {
545:         w1[j] += w4[j];
546:         w3[j]++;
547:       }
548:     }
549:   }

551:   nrqs     = 0; /* no of outgoing messages */
552:   msz      = 0; /* total mesg length (for all procs) */
553:   w1[rank] = 0; /* no mesg sent to itself */
554:   w3[rank] = 0;
555:   for (PetscMPIInt i = 0; i < size; i++) {
556:     if (w1[i]) {
557:       w2[i] = 1;
558:       nrqs++;
559:     } /* there exists a message to proc i */
560:   }
561:   /* pa is the list of processors to communicate with */
562:   PetscCall(PetscMalloc1(nrqs, &pa));
563:   for (PetscMPIInt i = 0, j = 0; i < size; i++) {
564:     if (w1[i]) {
565:       pa[j] = i;
566:       j++;
567:     }
568:   }

570:   /* Each message would have a header = 1 + 2*(no of IS) + data */
571:   for (PetscMPIInt i = 0; i < nrqs; i++) {
572:     PetscMPIInt j = pa[i];
573:     w1[j] += w2[j] + 2 * w3[j];
574:     msz += w1[j];
575:   }

577:   /* Determine the number of messages to expect, their lengths, and their from-ids */
578:   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
579:   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

581:   /* Now post the Irecvs corresponding to these messages */
582:   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

584:   /* Allocate Memory for outgoing messages */
585:   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
586:   PetscCall(PetscArrayzero(outdat, size));
587:   PetscCall(PetscArrayzero(ptr, size));

589:   {
590:     PetscInt *iptr = tmp, ict = 0;
591:     for (PetscMPIInt i = 0; i < nrqs; i++) {
592:       PetscMPIInt j = pa[i];
593:       iptr += ict;
594:       outdat[j] = iptr;
595:       ict       = w1[j];
596:     }
597:   }

599:   /* Form the outgoing messages */
600:   /* plug in the headers */
601:   for (PetscMPIInt i = 0; i < nrqs; i++) {
602:     PetscMPIInt j = pa[i];
603:     outdat[j][0]  = 0;
604:     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
605:     ptr[j] = outdat[j] + 2 * w3[j] + 1;
606:   }

608:   /* Memory for doing local proc's work */
609:   {
610:     PetscInt M_BPB_imax = 0;
611: #if defined(PETSC_USE_CTABLE)
612:     PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
613:     PetscCall(PetscMalloc1(imax, &table_data));
614:     for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
615:     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
616:     for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
617: #else
618:     PetscInt Mimax = 0;
619:     PetscCall(PetscIntMultError(M, imax, &Mimax));
620:     PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax));
621:     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
622:     for (PetscInt i = 0; i < imax; i++) {
623:       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
624:       data[i]  = d_p + M * i;
625:     }
626: #endif
627:   }

629:   /* Parse the IS and update local tables and the outgoing buf with the data */
630:   {
631:     PetscInt n_i, isz_i, *outdat_j, ctr_j;
632:     PetscBT  table_i;

634:     for (PetscInt i = 0; i < imax; i++) {
635:       PetscCall(PetscArrayzero(ctr, size));
636:       n_i     = n[i];
637:       table_i = table[i];
638:       idx_i   = idx[i];
639: #if defined(PETSC_USE_CTABLE)
640:       table_data_i = table_data[i];
641: #else
642:       data_i = data[i];
643: #endif
644:       isz_i = isz[i];
645:       for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
646:         row = idx_i[j];
647:         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
648:         if (proc != rank) { /* copy to the outgoing buffer */
649:           ctr[proc]++;
650:           *ptr[proc] = row;
651:           ptr[proc]++;
652:         } else if (!PetscBTLookupSet(table_i, row)) {
653: #if defined(PETSC_USE_CTABLE)
654:           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
655: #else
656:           data_i[isz_i] = row; /* Update the local table */
657: #endif
658:           isz_i++;
659:         }
660:       }
661:       /* Update the headers for the current IS */
662:       for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
663:         if ((ctr_j = ctr[j])) {
664:           outdat_j            = outdat[j];
665:           k                   = ++outdat_j[0];
666:           outdat_j[2 * k]     = ctr_j;
667:           outdat_j[2 * k - 1] = i;
668:         }
669:       }
670:       isz[i] = isz_i;
671:     }
672:   }

674:   /*  Now  post the sends */
675:   PetscCall(PetscMalloc1(nrqs, &s_waits1));
676:   for (PetscMPIInt i = 0; i < nrqs; ++i) {
677:     PetscMPIInt j = pa[i];
678:     PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
679:   }

681:   /* No longer need the original indices */
682:   PetscCall(PetscMalloc1(imax, &iscomms));
683:   for (PetscInt i = 0; i < imax; ++i) {
684:     PetscCall(ISRestoreIndices(is[i], idx + i));
685:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
686:   }
687:   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

689:   for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

691:   /* Do Local work */
692: #if defined(PETSC_USE_CTABLE)
693:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
694: #else
695:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
696: #endif

698:   /* Receive messages */
699:   PetscCall(PetscMalloc1(nrqr, &recv_status));
700:   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
701:   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

703:   /* Phase 1 sends are complete - deallocate buffers */
704:   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
705:   PetscCall(PetscFree4(w1, w2, w3, w4));

707:   PetscCall(PetscMalloc1(nrqr, &xdata));
708:   PetscCall(PetscMalloc1(nrqr, &isz1));
709:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
710:   PetscCall(PetscFree(rbuf[0]));
711:   PetscCall(PetscFree(rbuf));

713:   /* Send the data back */
714:   /* Do a global reduction to know the buffer space req for incoming messages */
715:   {
716:     PetscMPIInt *rw1;

718:     PetscCall(PetscCalloc1(size, &rw1));
719:     for (PetscMPIInt i = 0; i < nrqr; ++i) {
720:       proc = recv_status[i].MPI_SOURCE;
721:       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
722:       PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
723:     }
724:     PetscCall(PetscFree(onodes1));
725:     PetscCall(PetscFree(olengths1));

727:     /* Determine the number of messages to expect, their lengths, and their from-ids */
728:     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
729:     PetscCall(PetscFree(rw1));
730:   }
731:   /* Now post the Irecvs corresponding to these messages */
732:   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

734:   /* Now  post the sends */
735:   PetscCall(PetscMalloc1(nrqr, &s_waits2));
736:   for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); }

738:   /* receive work done on other processors */
739:   {
740:     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
741:     PetscMPIInt idex;
742:     PetscBT     table_i;

744:     for (PetscMPIInt i = 0; i < nrqs; ++i) {
745:       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
746:       /* Process the message */
747:       rbuf2_i = rbuf2[idex];
748:       ct1     = 2 * rbuf2_i[0] + 1;
749:       jmax    = rbuf2[idex][0];
750:       for (PetscInt j = 1; j <= jmax; j++) {
751:         max     = rbuf2_i[2 * j];
752:         is_no   = rbuf2_i[2 * j - 1];
753:         isz_i   = isz[is_no];
754:         table_i = table[is_no];
755: #if defined(PETSC_USE_CTABLE)
756:         table_data_i = table_data[is_no];
757: #else
758:         data_i = data[is_no];
759: #endif
760:         for (k = 0; k < max; k++, ct1++) {
761:           row = rbuf2_i[ct1];
762:           if (!PetscBTLookupSet(table_i, row)) {
763: #if defined(PETSC_USE_CTABLE)
764:             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
765: #else
766:             data_i[isz_i] = row;
767: #endif
768:             isz_i++;
769:           }
770:         }
771:         isz[is_no] = isz_i;
772:       }
773:     }

775:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
776:   }

778: #if defined(PETSC_USE_CTABLE)
779:   tcount_max = 0;
780:   for (PetscInt i = 0; i < imax; ++i) {
781:     table_data_i = table_data[i];
782:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
783:     if (tcount_max < tcount) tcount_max = tcount;
784:   }
785:   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
786: #endif

788:   for (PetscInt i = 0; i < imax; ++i) {
789: #if defined(PETSC_USE_CTABLE)
790:     PetscHashIter tpos;
791:     PetscInt      j;

793:     table_data_i = table_data[i];
794:     PetscHashIterBegin(table_data_i, tpos);
795:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
796:       PetscHashIterGetKey(table_data_i, tpos, k);
797:       PetscHashIterGetVal(table_data_i, tpos, j);
798:       PetscHashIterNext(table_data_i, tpos);
799:       tdata[--j] = --k;
800:     }
801:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
802: #else
803:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
804: #endif
805:     PetscCall(PetscCommDestroy(&iscomms[i]));
806:   }

808:   PetscCall(PetscFree(iscomms));
809:   PetscCall(PetscFree(onodes2));
810:   PetscCall(PetscFree(olengths2));

812:   PetscCall(PetscFree(pa));
813:   PetscCall(PetscFree(rbuf2[0]));
814:   PetscCall(PetscFree(rbuf2));
815:   PetscCall(PetscFree(s_waits1));
816:   PetscCall(PetscFree(r_waits1));
817:   PetscCall(PetscFree(s_waits2));
818:   PetscCall(PetscFree(r_waits2));
819:   PetscCall(PetscFree(recv_status));
820:   if (xdata) {
821:     PetscCall(PetscFree(xdata[0]));
822:     PetscCall(PetscFree(xdata));
823:   }
824:   PetscCall(PetscFree(isz1));
825: #if defined(PETSC_USE_CTABLE)
826:   for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
827:   PetscCall(PetscFree(table_data));
828:   PetscCall(PetscFree(tdata));
829:   PetscCall(PetscFree4(table, data, isz, t_p));
830: #else
831:   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
832: #endif
833:   PetscFunctionReturn(PETSC_SUCCESS);
834: }

836: /*
837:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap(), to do
838:        the work on the local processor.

840:      Inputs:
841:       C      - MAT_MPIAIJ;
842:       imax - total no of index sets processed at a time;
843:       table  - an array of char - size = m bits.

845:      Output:
846:       isz    - array containing the count of the solution elements corresponding
847:                to each index set;
848:       data or table_data  - pointer to the solutions
849: */
850: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
851: {
852:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
853:   Mat         A = c->A, B = c->B;
854:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
855:   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
856:   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
857:   PetscBT     table_i;
858: #if defined(PETSC_USE_CTABLE)
859:   PetscHMapI    table_data_i;
860:   PetscHashIter tpos;
861:   PetscInt      tcount, *tdata;
862: #else
863:   PetscInt *data_i;
864: #endif

866:   PetscFunctionBegin;
867:   rstart = C->rmap->rstart;
868:   cstart = C->cmap->rstart;
869:   ai     = a->i;
870:   aj     = a->j;
871:   bi     = b->i;
872:   bj     = b->j;
873:   garray = c->garray;

875:   for (i = 0; i < imax; i++) {
876: #if defined(PETSC_USE_CTABLE)
877:     /* copy existing entries of table_data_i into tdata[] */
878:     table_data_i = table_data[i];
879:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
880:     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

882:     PetscCall(PetscMalloc1(tcount, &tdata));
883:     PetscHashIterBegin(table_data_i, tpos);
884:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
885:       PetscHashIterGetKey(table_data_i, tpos, row);
886:       PetscHashIterGetVal(table_data_i, tpos, j);
887:       PetscHashIterNext(table_data_i, tpos);
888:       tdata[--j] = --row;
889:       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
890:     }
891: #else
892:     data_i = data[i];
893: #endif
894:     table_i = table[i];
895:     isz_i   = isz[i];
896:     max     = isz[i];

898:     for (j = 0; j < max; j++) {
899: #if defined(PETSC_USE_CTABLE)
900:       row = tdata[j] - rstart;
901: #else
902:       row = data_i[j] - rstart;
903: #endif
904:       start = ai[row];
905:       end   = ai[row + 1];
906:       for (k = start; k < end; k++) { /* Amat */
907:         val = aj[k] + cstart;
908:         if (!PetscBTLookupSet(table_i, val)) {
909: #if defined(PETSC_USE_CTABLE)
910:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
911: #else
912:           data_i[isz_i] = val;
913: #endif
914:           isz_i++;
915:         }
916:       }
917:       start = bi[row];
918:       end   = bi[row + 1];
919:       for (k = start; k < end; k++) { /* Bmat */
920:         val = garray[bj[k]];
921:         if (!PetscBTLookupSet(table_i, val)) {
922: #if defined(PETSC_USE_CTABLE)
923:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
924: #else
925:           data_i[isz_i] = val;
926: #endif
927:           isz_i++;
928:         }
929:       }
930:     }
931:     isz[i] = isz_i;

933: #if defined(PETSC_USE_CTABLE)
934:     PetscCall(PetscFree(tdata));
935: #endif
936:   }
937:   PetscFunctionReturn(PETSC_SUCCESS);
938: }
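
/*
   In the PETSC_USE_CTABLE branches above, keys and values are stored in the hash map
   shifted by one, presumably a convention inherited from the former PetscTable interface
   (which could not represent zero); the shift is undone when the map is flattened into an
   array:

     PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
       ...
     PetscHashIterGetKey(table_data_i, tpos, row);
     PetscHashIterGetVal(table_data_i, tpos, j);
     tdata[--j] = --row;
*/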

940: /*
941:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
942:          and return the output

944:          Input:
945:            C    - the matrix
946:            nrqr - no of messages being processed.
947:            rbuf - an array of pointers to the received requests

949:          Output:
950:            xdata - array of messages to be sent back
951:            isz1  - size of each message

953:   For better efficiency perhaps we should malloc each xdata[i] separately; then,
954: if a realloc is required, we would only need to copy the data for that one message
955: rather than for all previous messages, as happens now where a single large chunk
956: of memory is used.

958: */
959: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
960: {
961:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
962:   Mat         A = c->A, B = c->B;
963:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
964:   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
965:   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
966:   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
967:   PetscInt   *rbuf_i, kmax, rbuf_0;
968:   PetscBT     xtable;

970:   PetscFunctionBegin;
971:   m      = C->rmap->N;
972:   rstart = C->rmap->rstart;
973:   cstart = C->cmap->rstart;
974:   ai     = a->i;
975:   aj     = a->j;
976:   bi     = b->i;
977:   bj     = b->j;
978:   garray = c->garray;

980:   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
981:     rbuf_i = rbuf[i];
982:     rbuf_0 = rbuf_i[0];
983:     ct += rbuf_0;
984:     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
985:   }

987:   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
988:   else max1 = 1;
989:   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
990:   if (nrqr) {
991:     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
992:     ++no_malloc;
993:   }
994:   PetscCall(PetscBTCreate(m, &xtable));
995:   PetscCall(PetscArrayzero(isz1, nrqr));

997:   ct3 = 0;
998:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
999:     rbuf_i = rbuf[i];
1000:     rbuf_0 = rbuf_i[0];
1001:     ct1    = 2 * rbuf_0 + 1;
1002:     ct2    = ct1;
1003:     ct3 += ct1;
1004:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1005:       PetscCall(PetscBTMemzero(m, xtable));
1006:       oct2 = ct2;
1007:       kmax = rbuf_i[2 * j];
1008:       for (k = 0; k < kmax; k++, ct1++) {
1009:         row = rbuf_i[ct1];
1010:         if (!PetscBTLookupSet(xtable, row)) {
1011:           if (!(ct3 < mem_estimate)) {
1012:             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1013:             PetscCall(PetscMalloc1(new_estimate, &tmp));
1014:             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1015:             PetscCall(PetscFree(xdata[0]));
1016:             xdata[0]     = tmp;
1017:             mem_estimate = new_estimate;
1018:             ++no_malloc;
1019:             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1020:           }
1021:           xdata[i][ct2++] = row;
1022:           ct3++;
1023:         }
1024:       }
1025:       for (k = oct2, max2 = ct2; k < max2; k++) {
1026:         row   = xdata[i][k] - rstart;
1027:         start = ai[row];
1028:         end   = ai[row + 1];
1029:         for (l = start; l < end; l++) {
1030:           val = aj[l] + cstart;
1031:           if (!PetscBTLookupSet(xtable, val)) {
1032:             if (!(ct3 < mem_estimate)) {
1033:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1034:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1035:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1036:               PetscCall(PetscFree(xdata[0]));
1037:               xdata[0]     = tmp;
1038:               mem_estimate = new_estimate;
1039:               ++no_malloc;
1040:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1041:             }
1042:             xdata[i][ct2++] = val;
1043:             ct3++;
1044:           }
1045:         }
1046:         start = bi[row];
1047:         end   = bi[row + 1];
1048:         for (l = start; l < end; l++) {
1049:           val = garray[bj[l]];
1050:           if (!PetscBTLookupSet(xtable, val)) {
1051:             if (!(ct3 < mem_estimate)) {
1052:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1053:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1054:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1055:               PetscCall(PetscFree(xdata[0]));
1056:               xdata[0]     = tmp;
1057:               mem_estimate = new_estimate;
1058:               ++no_malloc;
1059:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1060:             }
1061:             xdata[i][ct2++] = val;
1062:             ct3++;
1063:           }
1064:         }
1065:       }
1066:       /* Update the header*/
1067:       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1068:       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1069:     }
1070:     xdata[i][0] = rbuf_0;
1071:     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1072:     isz1[i] = ct2; /* size of each message */
1073:   }
1074:   PetscCall(PetscBTDestroy(&xtable));
1075:   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1076:   PetscFunctionReturn(PETSC_SUCCESS);
1077: }
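
/*
   The reallocation used three times above, isolated (a sketch with placeholder names):
   grow the single chunk by roughly 1.5x, copy, and rebase every per-message pointer into
   the new chunk.

     if (used >= capacity) {
       PetscInt *bigger, newcap = (PetscInt)(1.5 * capacity) + 1;

       PetscCall(PetscMalloc1(newcap, &bigger));
       PetscCall(PetscArraycpy(bigger, xdata[0], capacity));
       PetscCall(PetscFree(xdata[0]));
       xdata[0] = bigger;
       capacity = newcap;
       for (PetscInt p = 1; p <= i; p++) xdata[p] = xdata[p - 1] + isz1[p - 1];
     }
*/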

1079: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1080: /*
1081:     Every processor gets the entire matrix
1082: */
1083: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1084: {
1085:   Mat         B;
1086:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1087:   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1088:   PetscMPIInt size, rank;
1089:   PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1090:   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

1092:   PetscFunctionBegin;
1093:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1094:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1095:   if (scall == MAT_INITIAL_MATRIX) {
1096:     /* Tell every processor the number of nonzeros per row */
1097:     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1098:     for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

1100:     /* All MPI processes get the same matrix */
1101:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

1103:     /*     Create the sequential matrix of the same type as the local block diagonal  */
1104:     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1105:     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1106:     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1107:     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1108:     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1109:     PetscCall(PetscCalloc1(2, Bin));
1110:     **Bin = B;
1111:     b     = (Mat_SeqAIJ *)B->data;

1113:     /* zero column space */
1114:     nrecv = 0;
1115:     for (PetscMPIInt i = 0; i < size; i++) {
1116:       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1117:     }
1118:     PetscCall(PetscArrayzero(b->j, nrecv));

1120:     /*   Copy my part of matrix column indices over    */
1121:     sendcount  = ad->nz + bd->nz;
1122:     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1123:     a_jsendbuf = ad->j;
1124:     b_jsendbuf = bd->j;
1125:     n          = A->rmap->rend - A->rmap->rstart;
1126:     cnt        = 0;
1127:     for (PetscInt i = 0; i < n; i++) {
1128:       /* put in lower diagonal portion */
1129:       m = bd->i[i + 1] - bd->i[i];
1130:       while (m > 0) {
1131:         /* is it above diagonal (in bd (compressed) numbering) */
1132:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1133:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1134:         m--;
1135:       }

1137:       /* put in diagonal portion */
1138:       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

1140:       /* put in upper diagonal portion */
1141:       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1142:     }
1143:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1145:     /* send column indices, b->j was zeroed */
1146:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

1148:     /*  Assemble the matrix into usable form (numerical values not yet set) */
1149:     /* set the b->ilen (length of each row) values */
1150:     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1151:     /* set the b->i indices */
1152:     b->i[0] = 0;
1153:     for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1154:     PetscCall(PetscFree(lens));
1155:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1156:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

1158:   } else {
1159:     B = **Bin;
1160:     b = (Mat_SeqAIJ *)B->data;
1161:   }

1163:   /* Copy my part of matrix numerical values into the values location */
1164:   if (flag == MAT_GET_VALUES) {
1165:     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1166:     MatScalar         *sendbuf;

1168:     /* initialize b->a */
1169:     PetscCall(PetscArrayzero(b->a, b->nz));

1171:     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172:     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173:     sendcount = ad->nz + bd->nz;
1174:     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1175:     a_sendbuf = ada;
1176:     b_sendbuf = bda;
1177:     b_sendj   = bd->j;
1178:     n         = A->rmap->rend - A->rmap->rstart;
1179:     cnt       = 0;
1180:     for (PetscInt i = 0; i < n; i++) {
1181:       /* put in lower diagonal portion */
1182:       m = bd->i[i + 1] - bd->i[i];
1183:       while (m > 0) {
1184:         /* is it above diagonal (in bd (compressed) numbering) */
1185:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186:         sendbuf[cnt++] = *b_sendbuf++;
1187:         m--;
1188:         b_sendj++;
1189:       }

1191:       /* put in diagonal portion */
1192:       for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

1194:       /* put in upper diagonal portion */
1195:       while (m-- > 0) {
1196:         sendbuf[cnt++] = *b_sendbuf++;
1197:         b_sendj++;
1198:       }
1199:     }
1200:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1202:     /* send values, b->a was zeroed */
1203:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));

1205:     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1206:     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1207:   } /* endof (flag == MAT_GET_VALUES) */

1209:   PetscCall(MatPropagateSymmetryOptions(A, B));
1210:   PetscFunctionReturn(PETSC_SUCCESS);
1211: }
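
/*
   The replication strategy above, isolated: every rank zeroes an array of global length,
   fills only the slice it owns, and a summing Allreduce assembles the complete array on
   all ranks. A minimal sketch for the row-length array, where nonzeros_in_local_row() is
   a hypothetical helper standing in for the ad->i/bd->i arithmetic used above:

     PetscInt *lens;

     PetscCall(PetscCalloc1(A->rmap->N, &lens));
     for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = nonzeros_in_local_row(i - A->rmap->rstart);
     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
*/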

1213: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1214: {
1215:   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1216:   Mat             submat, A = c->A, B = c->B;
1217:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1218:   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1219:   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1220:   const PetscInt *icol, *irow;
1221:   PetscInt        nrow, ncol, start;
1222:   PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
1223:   PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
1224:   PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
1225:   PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
1226:   PetscInt       *lens, rmax, ncols, *cols, Crow;
1227: #if defined(PETSC_USE_CTABLE)
1228:   PetscHMapI cmap, rmap;
1229:   PetscInt  *cmap_loc, *rmap_loc;
1230: #else
1231:   PetscInt *cmap, *rmap;
1232: #endif
1233:   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
1234:   PetscInt     *cworkB, lwrite, *subcols, ib, jb;
1235:   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1236:   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1237:   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1238:   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1239:   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1240:   MPI_Comm      comm;
1241:   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1242:   PetscMPIInt  *onodes1, *olengths1, idex, end, *row2proc;
1243:   Mat_SubSppt  *smatis1;
1244:   PetscBool     isrowsorted, iscolsorted;

1246:   PetscFunctionBegin;
1249:   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1250:   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1251:   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1252:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1253:   size = c->size;
1254:   rank = c->rank;

1256:   PetscCall(ISSorted(iscol[0], &iscolsorted));
1257:   PetscCall(ISSorted(isrow[0], &isrowsorted));
1258:   PetscCall(ISGetIndices(isrow[0], &irow));
1259:   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1260:   if (allcolumns) {
1261:     icol = NULL;
1262:     ncol = C->cmap->N;
1263:   } else {
1264:     PetscCall(ISGetIndices(iscol[0], &icol));
1265:     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1266:   }

1268:   if (scall == MAT_INITIAL_MATRIX) {
1269:     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

1271:     /* Get some new tags to keep the communication clean */
1272:     tag1 = ((PetscObject)C)->tag;
1273:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1274:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

1276:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1277:      required. Based on this, buffers are allocated, and data copied into them */
1278:     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1279:     PetscCall(PetscMalloc1(nrow, &row2proc));

1281:     /* w1[proc] = num of rows owned by proc -- to be requested */
1282:     proc = 0;
1283:     nrqs = 0; /* num of outgoing messages */
1284:     for (PetscInt j = 0; j < nrow; j++) {
1285:       row = irow[j];
1286:       if (!isrowsorted) proc = 0;
1287:       while (row >= C->rmap->range[proc + 1]) proc++;
1288:       w1[proc]++;
1289:       row2proc[j] = proc; /* map row index to proc */

1291:       if (proc != rank && !w2[proc]) {
1292:         w2[proc] = 1;
1293:         nrqs++;
1294:       }
1295:     }
1296:     w1[rank] = 0; /* rows owned by self will not be requested */

1298:     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1299:     for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) {
1300:       if (w1[proc]) pa[j++] = proc;
1301:     }

1303:     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1304:     msz = 0; /* total mesg length (for all procs) */
1305:     for (PetscMPIInt i = 0; i < nrqs; i++) {
1306:       w1[pa[i]] += 3;
1307:       msz += w1[pa[i]];
1308:     }
1309:     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));

1311:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1312:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1313:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));

1315:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1316:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1317:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

1319:     /* Now post the Irecvs corresponding to these messages */
1320:     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

1322:     PetscCall(PetscFree(onodes1));
1323:     PetscCall(PetscFree(olengths1));

1325:     /* Allocate Memory for outgoing messages */
1326:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1327:     PetscCall(PetscArrayzero(sbuf1, size));
1328:     PetscCall(PetscArrayzero(ptr, size));

1330:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1331:     iptr = tmp;
1332:     for (PetscMPIInt i = 0; i < nrqs; i++) {
1333:       sbuf1[pa[i]] = iptr;
1334:       iptr += w1[pa[i]];
1335:     }

1337:     /* Form the outgoing messages */
1338:     /* Initialize the header space */
1339:     for (PetscMPIInt i = 0; i < nrqs; i++) {
1340:       PetscCall(PetscArrayzero(sbuf1[pa[i]], 3));
1341:       ptr[pa[i]] = sbuf1[pa[i]] + 3;
1342:     }

1344:     /* Parse the isrow and copy data into outbuf */
1345:     PetscCall(PetscArrayzero(ctr, size));
1346:     for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */
1347:       if (row2proc[j] != rank) {          /* copy to the outgoing buf */
1348:         *ptr[row2proc[j]] = irow[j];
1349:         ctr[row2proc[j]]++;
1350:         ptr[row2proc[j]]++;
1351:       }
1352:     }

1354:     /* Update the headers for the current IS */
1355:     for (PetscMPIInt j = 0; j < size; j++) { /* this loop could also be optimized */
1356:       if ((ctr_j = ctr[j])) {
1357:         sbuf1_j            = sbuf1[j];
1358:         k                  = ++sbuf1_j[0];
1359:         sbuf1_j[2 * k]     = ctr_j;
1360:         sbuf1_j[2 * k - 1] = 0;
1361:       }
1362:     }
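          /* Each outgoing request sbuf1[proc] now has the layout (single IS, hence a 3-entry header):
               [ #IS = 1 | is_no = 0 | #rows requested | requested global row indices ... ]
             with total length w1[proc] = 3 + (number of rows of isrow[0] owned by proc) */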

1364:     /* Now post the sends */
1365:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1366:     for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Isend(sbuf1[pa[i]], w1[pa[i]], MPIU_INT, pa[i], tag1, comm, s_waits1 + i));

1368:     /* Post Receives to capture the buffer size */
1369:     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1370:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));

1372:     if (nrqs) rbuf2[0] = tmp + msz;
1373:     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

1375:     for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[pa[i]], MPIU_INT, pa[i], tag2, comm, r_waits2 + i));

1377:     PetscCall(PetscFree2(w1, w2));

1379:     /* Send to other procs the buffer size they should allocate */
1380:     /* Receive messages */
1381:     PetscCall(PetscMalloc1(nrqr, &r_status1));
1382:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));

1384:     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1385:     for (PetscMPIInt i = 0; i < nrqr; ++i) {
1386:       req_size[i] = 0;
1387:       rbuf1_i     = rbuf1[i];
1388:       start       = 2 * rbuf1_i[0] + 1;
1389:       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1390:       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1391:       sbuf2_i = sbuf2[i];
1392:       for (PetscInt j = start; j < end; j++) {
1393:         k          = rbuf1_i[j] - rstart;
1394:         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1395:         sbuf2_i[j] = ncols;
1396:         req_size[i] += ncols;
1397:       }
1398:       req_source1[i] = r_status1[i].MPI_SOURCE;

1400:       /* form the header */
1401:       sbuf2_i[0] = req_size[i];
1402:       for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

1404:       PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1405:     }
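          /* Each reply sbuf2[i] echoes the request header and carries, for every requested row, the number of
             nonzeros that row has in C (diagonal part A plus off-diagonal part B):
               sbuf2_i[0]              = total number of nonzeros to be shipped back (req_size[i])
               sbuf2_i[1 .. start-1]   = copy of the request header
               sbuf2_i[start .. end-1] = nonzero count of the corresponding requested row */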

1407:     PetscCall(PetscFree(r_status1));
1408:     PetscCall(PetscFree(r_waits1));

1410:     /* rbuf2 has been received; post receives for the column indices a->j */
1411:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));

1413:     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1414:     for (PetscMPIInt i = 0; i < nrqs; ++i) {
1415:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1416:       req_source2[i] = r_status2[i].MPI_SOURCE;
1417:       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1418:     }

1420:     /* Wait on sends1 and sends2 */
1421:     PetscCall(PetscMalloc1(nrqs, &s_status1));
1422:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1423:     PetscCall(PetscFree(s_waits1));
1424:     PetscCall(PetscFree(s_status1));

1426:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1427:     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));

1429:     /* Now allocate sending buffers for a->j, and send them off */
1430:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1431:     jcnt = 0;
1432:     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1433:     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
1434:     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

1436:     for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each received request */
1437:       rbuf1_i   = rbuf1[i];
1438:       sbuf_aj_i = sbuf_aj[i];
1439:       ct1       = 2 * rbuf1_i[0] + 1;
1440:       ct2       = 0;
1441:       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

1443:       kmax = rbuf1[i][2];
1444:       for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */
1445:         row    = rbuf1_i[ct1] - rstart;
1446:         nzA    = ai[row + 1] - ai[row];
1447:         nzB    = bi[row + 1] - bi[row];
1448:         ncols  = nzA + nzB;
1449:         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1450:         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);

1452:         /* load the column indices for this row into cols */
1453:         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);

1455:         lwrite = 0;
1456:         for (PetscInt l = 0; l < nzB; l++) {
1457:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1458:         }
1459:         for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1460:         for (PetscInt l = 0; l < nzB; l++) {
1461:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1462:         }

1464:         ct2 += ncols;
1465:       }
1466:       PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1467:     }
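          /* Within each row the indices are packed as: off-diagonal entries with global column < cstart, then
             the diagonal block (local index shifted by cstart), then off-diagonal entries with global column
             >= cend; since bmap = c->garray is kept sorted, the packed indices of every row end up in
             increasing global column order */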

1469:     /* create column map (cmap): global col of C -> local col of submat */
1470: #if defined(PETSC_USE_CTABLE)
1471:     if (!allcolumns) {
1472:       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1473:       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1474:       for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1475:         if (icol[j] >= cstart && icol[j] < cend) {
1476:           cmap_loc[icol[j] - cstart] = j + 1;
1477:         } else { /* use PetscHMapI for non-local col indices */
1478:           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1479:         }
1480:       }
1481:     } else {
1482:       cmap     = NULL;
1483:       cmap_loc = NULL;
1484:     }
1485:     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1486: #else
1487:     if (!allcolumns) {
1488:       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1489:       for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1490:     } else {
1491:       cmap = NULL;
1492:     }
1493: #endif
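          /* cmap maps a global column of C to 1 + its local column in the submatrix, so a lookup of 0 means
             "column not selected by iscol"; with PETSC_USE_CTABLE the columns owned by this rank are kept in
             the dense array cmap_loc and only off-process columns go into the hash map */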

1495:     /* Create lens for MatSeqAIJSetPreallocation() */
1496:     PetscCall(PetscCalloc1(nrow, &lens));

1498:     /* Compute lens from local part of C */
1499:     for (PetscInt j = 0; j < nrow; j++) {
1500:       row = irow[j];
1501:       if (row2proc[j] == rank) {
1502:         /* diagonal part A = c->A */
1503:         ncols = ai[row - rstart + 1] - ai[row - rstart];
1504:         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1505:         if (!allcolumns) {
1506:           for (PetscInt k = 0; k < ncols; k++) {
1507: #if defined(PETSC_USE_CTABLE)
1508:             tcol = cmap_loc[cols[k]];
1509: #else
1510:             tcol = cmap[cols[k] + cstart];
1511: #endif
1512:             if (tcol) lens[j]++;
1513:           }
1514:         } else { /* allcolumns */
1515:           lens[j] = ncols;
1516:         }

1518:         /* off-diagonal part B = c->B */
1519:         ncols = bi[row - rstart + 1] - bi[row - rstart];
1520:         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1521:         if (!allcolumns) {
1522:           for (PetscInt k = 0; k < ncols; k++) {
1523: #if defined(PETSC_USE_CTABLE)
1524:             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1525: #else
1526:             tcol = cmap[bmap[cols[k]]];
1527: #endif
1528:             if (tcol) lens[j]++;
1529:           }
1530:         } else { /* allcolumns */
1531:           lens[j] += ncols;
1532:         }
1533:       }
1534:     }

1536:     /* Create row map (rmap): global row of C -> local row of submat */
1537: #if defined(PETSC_USE_CTABLE)
1538:     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1539:     for (PetscInt j = 0; j < nrow; j++) {
1540:       row = irow[j];
1541:       if (row2proc[j] == rank) { /* a local row */
1542:         rmap_loc[row - rstart] = j;
1543:       } else {
1544:         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1545:       }
1546:     }
1547: #else
1548:     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1549:     for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j;
1550: #endif
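          /* rmap maps a global row of C to its local row in the submatrix; the hash map stores row+1 (so 0
             means "not found") and is decremented after lookup, whereas rmap_loc and the dense array variant
             store the local row directly */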

1552:     /* Update lens from off-process data */
1553:     /* recv a->j is done */
1554:     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1555:     for (PetscMPIInt i = 0; i < nrqs; i++) {
1556:       sbuf1_i = sbuf1[pa[i]];
1557:       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1558:       ct1     = 2 + 1;
1559:       ct2     = 0;
1560:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1561:       rbuf3_i = rbuf3[i]; /* received C->j */

1563:       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1564:       max1 = sbuf1_i[2];
1565:       for (PetscInt k = 0; k < max1; k++, ct1++) {
1566: #if defined(PETSC_USE_CTABLE)
1567:         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1568:         row--;
1569:         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1570: #else
1571:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1572: #endif
1573:         /* Now, store row index of submat in sbuf1_i[ct1] */
1574:         sbuf1_i[ct1] = row;

1576:         nnz = rbuf2_i[ct1];
1577:         if (!allcolumns) {
1578:           for (PetscInt l = 0; l < nnz; l++, ct2++) {
1579: #if defined(PETSC_USE_CTABLE)
1580:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1581:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1582:             } else {
1583:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1584:             }
1585: #else
1586:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1587: #endif
1588:             if (tcol) lens[row]++;
1589:           }
1590:         } else { /* allcolumns */
1591:           lens[row] += nnz;
1592:         }
1593:       }
1594:     }
1595:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1596:     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));

1598:     /* Create the submatrices */
1599:     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1600:     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));

1602:     PetscCall(ISGetBlockSize(isrow[0], &ib));
1603:     PetscCall(ISGetBlockSize(iscol[0], &jb));
1604:     if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb));
1605:     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1606:     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));

1608:     /* create struct Mat_SubSppt and attach it to submat */
1609:     PetscCall(PetscNew(&smatis1));
1610:     subc            = (Mat_SeqAIJ *)submat->data;
1611:     subc->submatis1 = smatis1;

1613:     smatis1->id          = 0;
1614:     smatis1->nrqs        = nrqs;
1615:     smatis1->nrqr        = nrqr;
1616:     smatis1->rbuf1       = rbuf1;
1617:     smatis1->rbuf2       = rbuf2;
1618:     smatis1->rbuf3       = rbuf3;
1619:     smatis1->sbuf2       = sbuf2;
1620:     smatis1->req_source2 = req_source2;

1622:     smatis1->sbuf1 = sbuf1;
1623:     smatis1->ptr   = ptr;
1624:     smatis1->tmp   = tmp;
1625:     smatis1->ctr   = ctr;

1627:     smatis1->pa          = pa;
1628:     smatis1->req_size    = req_size;
1629:     smatis1->req_source1 = req_source1;

1631:     smatis1->allcolumns = allcolumns;
1632:     smatis1->singleis   = PETSC_TRUE;
1633:     smatis1->row2proc   = row2proc;
1634:     smatis1->rmap       = rmap;
1635:     smatis1->cmap       = cmap;
1636: #if defined(PETSC_USE_CTABLE)
1637:     smatis1->rmap_loc = rmap_loc;
1638:     smatis1->cmap_loc = cmap_loc;
1639: #endif

1641:     smatis1->destroy     = submat->ops->destroy;
1642:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1643:     submat->factortype   = C->factortype;

1645:     /* compute rmax */
1646:     rmax = 0;
1647:     for (PetscInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);

1649:   } else { /* scall == MAT_REUSE_MATRIX */
1650:     submat = submats[0];
1651:     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");

1653:     subc        = (Mat_SeqAIJ *)submat->data;
1654:     rmax        = subc->rmax;
1655:     smatis1     = subc->submatis1;
1656:     nrqs        = smatis1->nrqs;
1657:     nrqr        = smatis1->nrqr;
1658:     rbuf1       = smatis1->rbuf1;
1659:     rbuf2       = smatis1->rbuf2;
1660:     rbuf3       = smatis1->rbuf3;
1661:     req_source2 = smatis1->req_source2;

1663:     sbuf1 = smatis1->sbuf1;
1664:     sbuf2 = smatis1->sbuf2;
1665:     ptr   = smatis1->ptr;
1666:     tmp   = smatis1->tmp;
1667:     ctr   = smatis1->ctr;

1669:     pa          = smatis1->pa;
1670:     req_size    = smatis1->req_size;
1671:     req_source1 = smatis1->req_source1;

1673:     allcolumns = smatis1->allcolumns;
1674:     row2proc   = smatis1->row2proc;
1675:     rmap       = smatis1->rmap;
1676:     cmap       = smatis1->cmap;
1677: #if defined(PETSC_USE_CTABLE)
1678:     rmap_loc = smatis1->rmap_loc;
1679:     cmap_loc = smatis1->cmap_loc;
1680: #endif
1681:   }

1683:   /* Post recv matrix values */
1684:   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1685:   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1686:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1687:   for (PetscMPIInt i = 0; i < nrqs; ++i) {
1688:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1689:     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1690:   }

1692:   /* Allocate sending buffers for a->a, and send them off */
1693:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1694:   jcnt = 0;
1695:   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
1696:   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
1697:   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

1699:   for (PetscMPIInt i = 0; i < nrqr; i++) {
1700:     rbuf1_i   = rbuf1[i];
1701:     sbuf_aa_i = sbuf_aa[i];
1702:     ct1       = 2 * rbuf1_i[0] + 1;
1703:     ct2       = 0;
1704:     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1706:     kmax = rbuf1_i[2];
1707:     for (PetscInt k = 0; k < kmax; k++, ct1++) {
1708:       row    = rbuf1_i[ct1] - rstart;
1709:       nzA    = ai[row + 1] - ai[row];
1710:       nzB    = bi[row + 1] - bi[row];
1711:       ncols  = nzA + nzB;
1712:       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1713:       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1714:       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);

1716:       /* load the column values for this row into vals */
1717:       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);

1719:       lwrite = 0;
1720:       for (PetscInt l = 0; l < nzB; l++) {
1721:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1722:       }
1723:       for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1724:       for (PetscInt l = 0; l < nzB; l++) {
1725:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1726:       }

1728:       ct2 += ncols;
1729:     }
1730:     PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1731:   }
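        /* The values are packed in exactly the same order as the column indices were packed into sbuf_aj
           above, so rbuf3[i][k] and rbuf4[i][k] always refer to the same entry of C */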

1733:   /* Assemble submat */
1734:   /* First assemble the local rows */
1735:   for (PetscInt j = 0; j < nrow; j++) {
1736:     row = irow[j];
1737:     if (row2proc[j] == rank) {
1738:       Crow = row - rstart; /* local row index of C */
1739: #if defined(PETSC_USE_CTABLE)
1740:       row = rmap_loc[Crow]; /* row index of submat */
1741: #else
1742:       row = rmap[row];
1743: #endif

1745:       if (allcolumns) {
1746:         PetscInt ncol = 0;

1748:         /* diagonal part A = c->A */
1749:         ncols = ai[Crow + 1] - ai[Crow];
1750:         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1751:         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1752:         for (PetscInt k = 0; k < ncols; k++) {
1753:           subcols[ncol]   = cols[k] + cstart;
1754:           subvals[ncol++] = vals[k];
1755:         }

1757:         /* off-diagonal part B = c->B */
1758:         ncols = bi[Crow + 1] - bi[Crow];
1759:         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1760:         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1761:         for (PetscInt k = 0; k < ncols; k++) {
1762:           subcols[ncol]   = bmap[cols[k]];
1763:           subvals[ncol++] = vals[k];
1764:         }

1766:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));

1768:       } else { /* !allcolumns */
1769:         PetscInt ncol = 0;

1771: #if defined(PETSC_USE_CTABLE)
1772:         /* diagonal part A = c->A */
1773:         ncols = ai[Crow + 1] - ai[Crow];
1774:         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1775:         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1776:         for (PetscInt k = 0; k < ncols; k++) {
1777:           tcol = cmap_loc[cols[k]];
1778:           if (tcol) {
1779:             subcols[ncol]   = --tcol;
1780:             subvals[ncol++] = vals[k];
1781:           }
1782:         }

1784:         /* off-diagonal part B = c->B */
1785:         ncols = bi[Crow + 1] - bi[Crow];
1786:         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1787:         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1788:         for (PetscInt k = 0; k < ncols; k++) {
1789:           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1790:           if (tcol) {
1791:             subcols[ncol]   = --tcol;
1792:             subvals[ncol++] = vals[k];
1793:           }
1794:         }
1795: #else
1796:         /* diagonal part A = c->A */
1797:         ncols = ai[Crow + 1] - ai[Crow];
1798:         cols  = aj + ai[Crow];
1799:         vals  = a_a + ai[Crow];
1800:         for (PetscInt k = 0; k < ncols; k++) {
1801:           tcol = cmap[cols[k] + cstart];
1802:           if (tcol) {
1803:             subcols[ncol]   = --tcol;
1804:             subvals[ncol++] = vals[k];
1805:           }
1806:         }

1808:         /* off-diagonal part B = c->B */
1809:         ncols = bi[Crow + 1] - bi[Crow];
1810:         cols  = bj + bi[Crow];
1811:         vals  = b_a + bi[Crow];
1812:         for (PetscInt k = 0; k < ncols; k++) {
1813:           tcol = cmap[bmap[cols[k]]];
1814:           if (tcol) {
1815:             subcols[ncol]   = --tcol;
1816:             subvals[ncol++] = vals[k];
1817:           }
1818:         }
1819: #endif
1820:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES));
1821:       }
1822:     }
1823:   }

1825:   /* Now assemble the off-proc rows */
1826:   for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */
1827:     /* recv values from other processes */
1828:     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1829:     sbuf1_i = sbuf1[pa[idex]];
1830:     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1831:     ct1     = 2 + 1;
1832:     ct2     = 0;           /* count of received C->j */
1833:     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1834:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1835:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1836:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1838:     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1839:     max1 = sbuf1_i[2];                           /* num of rows */
1840:     for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each received row */
1841:       row = sbuf1_i[ct1];                        /* row index of submat */
1842:       if (!allcolumns) {
1843:         idex = 0;
1844:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1845:           nnz = rbuf2_i[ct1];                         /* num of C entries in this row */
1846:           for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each received column */
1847: #if defined(PETSC_USE_CTABLE)
1848:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1849:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1850:             } else {
1851:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1852:             }
1853: #else
1854:             tcol = cmap[rbuf3_i[ct2]];
1855: #endif
1856:             if (tcol) {
1857:               subcols[idex]   = --tcol; /* may not be sorted */
1858:               subvals[idex++] = rbuf4_i[ct2];

1860:               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1861:                For reuse, we overwrite the received C->j with the indices of the entries that get inserted into submat */
1862:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1863:             }
1864:           }
1865:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1866:         } else { /* scall == MAT_REUSE_MATRIX */
1867:           submat = submats[0];
1868:           subc   = (Mat_SeqAIJ *)submat->data;

1870:           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1871:           for (PetscInt l = 0; l < nnz; l++) {
1872:             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1873:             subvals[idex++] = rbuf4_i[ct2];
1874:           }

1876:           bj = subc->j + subc->i[row]; /* sorted column indices */
1877:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1878:         }
1879:       } else {              /* allcolumns */
1880:         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1881:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1882:         ct2 += nnz;
1883:       }
1884:     }
1885:   }
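        /* Reuse optimization: on the MAT_INITIAL_MATRIX pass (or when iscol is unsorted) the column-map
           lookups above decide which received entries survive, and for sorted iscol the surviving positions
           are recorded in place in rbuf3_i; a subsequent MAT_REUSE_MATRIX call can then gather the new values
           straight from rbuf4_i without repeating any lookups */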

1887:   /* sends of a->a are done */
1888:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1889:   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));

1891:   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1892:   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1893:   submats[0] = submat;

1895:   /* Restore the indices */
1896:   PetscCall(ISRestoreIndices(isrow[0], &irow));
1897:   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));

1899:   /* Destroy allocated memory */
1900:   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1901:   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1902:   if (sbuf_aa) {
1903:     PetscCall(PetscFree(sbuf_aa[0]));
1904:     PetscCall(PetscFree(sbuf_aa));
1905:   }

1907:   if (scall == MAT_INITIAL_MATRIX) {
1908:     PetscCall(PetscFree(lens));
1909:     if (sbuf_aj) {
1910:       PetscCall(PetscFree(sbuf_aj[0]));
1911:       PetscCall(PetscFree(sbuf_aj));
1912:     }
1913:   }
1914:   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1915:   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1916:   PetscFunctionReturn(PETSC_SUCCESS);
1917: }

1919: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1920: {
1921:   PetscInt  ncol;
1922:   PetscBool colflag, allcolumns = PETSC_FALSE;

1924:   PetscFunctionBegin;
1925:   /* Allocate memory to hold all the submatrices */
1926:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

1928:   /* Check for special case: each processor gets entire matrix columns */
1929:   PetscCall(ISIdentity(iscol[0], &colflag));
1930:   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1931:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1933:   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1934:   PetscFunctionReturn(PETSC_SUCCESS);
1935: }

1937: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1938: {
1939:   PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
1940:   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1941:   Mat_SeqAIJ  *subc;
1942:   Mat_SubSppt *smat;

1944:   PetscFunctionBegin;
1945:   /* Check for special case: each processor has a single IS */
1946:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1947:     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1948:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1949:     PetscFunctionReturn(PETSC_SUCCESS);
1950:   }

1952:   /* Determine nmax, the maximum number of submatrices that can be extracted per stage */
1953:   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1954:   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1955:   if (!nmax) nmax = 1;
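        /* nmax = 20e6 / (N * sizeof(PetscInt)) is the number of length-N PetscInt work arrays (one per
           submatrix) that fit in roughly 20 MB per stage; e.g. with 8-byte PetscInt and N = 1,000,000 columns
           this gives nmax = 2, and for N above 2.5 million the integer quotient becomes 0 and nmax is forced
           to 1 (illustrative numbers assuming 64-bit indices) */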

1957:   if (scall == MAT_INITIAL_MATRIX) {
1958:     /* Collect global wantallmatrix and nstages */
1959:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1960:       PetscCall(ISIdentity(*isrow, &rowflag));
1961:       PetscCall(ISIdentity(*iscol, &colflag));
1962:       PetscCall(ISGetLocalSize(*isrow, &nrow));
1963:       PetscCall(ISGetLocalSize(*iscol, &ncol));
1964:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1965:         wantallmatrix = PETSC_TRUE;

1967:         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1968:       }
1969:     }

1971:     /* Determine the number of stages in which the submatrices are extracted.
1972:        Each stage extracts at most nmax submatrices.
1973:        nmax is determined by the matrix column dimension:
1974:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1975:     */
1976:     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

1978:     in[0] = -1 * (PetscInt)wantallmatrix;
1979:     in[1] = nstages;
1980:     PetscCallMPI(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1981:     wantallmatrix = (PetscBool)(-out[0]);
1982:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */
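          /* in[0] = -wantallmatrix, so the MPI_MAX reduction returns 0 as soon as any rank does not want the
             whole matrix and -1 only when every rank does, i.e. it acts as a logical AND across ranks;
             out[1] is the maximum of the local stage counts, so every rank runs the same number of stages */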

1984:   } else { /* MAT_REUSE_MATRIX */
1985:     if (ismax) {
1986:       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
1987:       smat = subc->submatis1;
1988:     } else { /* (*submat)[0] is a dummy matrix */
1989:       smat = (Mat_SubSppt *)(*submat)[0]->data;
1990:     }
1991:     if (!smat) {
1992:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1993:       wantallmatrix = PETSC_TRUE;
1994:     } else if (smat->singleis) {
1995:       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1996:       PetscFunctionReturn(PETSC_SUCCESS);
1997:     } else {
1998:       nstages = smat->nstages;
1999:     }
2000:   }

2002:   if (wantallmatrix) {
2003:     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2004:     PetscFunctionReturn(PETSC_SUCCESS);
2005:   }

2007:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2008:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

2010:   for (PetscInt i = 0, pos = 0; i < nstages; i++) {
2011:     if (pos + nmax <= ismax) max_no = nmax;
2012:     else if (pos >= ismax) max_no = 0;
2013:     else max_no = ismax - pos;

2015:     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2016:     if (!max_no) {
2017:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2018:         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2019:         smat->nstages = nstages;
2020:       }
2021:       pos++; /* advance to next dummy matrix if any */
2022:     } else pos += max_no;
2023:   }

2025:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2026:     /* save nstages for reuse */
2027:     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2028:     smat          = subc->submatis1;
2029:     smat->nstages = nstages;
2030:   }
2031:   PetscFunctionReturn(PETSC_SUCCESS);
2032: }
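      /*
         A minimal usage sketch (illustrative only, not part of this file) of the public entry point that
         dispatches to the routine above for MATMPIAIJ matrices; first_row/first_col are placeholder values
         and error checking with PetscCall() is omitted for brevity:

           IS  rowis, colis;
           Mat *submats;

           ISCreateStride(PETSC_COMM_SELF, 5, first_row, 1, &rowis);
           ISCreateStride(PETSC_COMM_SELF, 5, first_col, 1, &colis);
           MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_INITIAL_MATRIX, &submats);
           (change the values of C here, then refresh the extracted block:)
           MatCreateSubMatrices(C, 1, &rowis, &colis, MAT_REUSE_MATRIX, &submats);
           MatDestroySubMatrices(1, &submats);
           ISDestroy(&rowis);
           ISDestroy(&colis);
      */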

2034: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2035: {
2036:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2037:   Mat              A = c->A;
2038:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2039:   const PetscInt **icol, **irow;
2040:   PetscInt        *nrow, *ncol, start;
2041:   PetscMPIInt      nrqs = 0, *pa, proc = -1;
2042:   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
2043:   PetscInt       **sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row;
2044:   PetscInt         msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol;
2045:   PetscInt       **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2046:   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2047: #if defined(PETSC_USE_CTABLE)
2048:   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2049: #else
2050:   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2051: #endif
2052:   const PetscInt *irow_i;
2053:   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2054:   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2055:   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2056:   MPI_Comm        comm;
2057:   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2058:   PetscMPIInt    *onodes1, *olengths1, end, **row2proc, *row2proc_i;
2059:   PetscInt        ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2060:   Mat_SubSppt    *smat_i;
2061:   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2062:   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt;

2064:   PetscFunctionBegin;
2065:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2066:   size = c->size;
2067:   rank = c->rank;

2069:   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2070:   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));

2072:   for (PetscInt i = 0; i < ismax; i++) {
2073:     PetscCall(ISSorted(iscol[i], &issorted[i]));
2074:     if (!issorted[i]) iscsorted = issorted[i];

2076:     PetscCall(ISSorted(isrow[i], &issorted[i]));

2078:     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2079:     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));

2081:     /* Check for special case: allcolumn */
2082:     PetscCall(ISIdentity(iscol[i], &colflag));
2083:     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2084:     if (colflag && ncol[i] == C->cmap->N) {
2085:       allcolumns[i] = PETSC_TRUE;
2086:       icol[i]       = NULL;
2087:     } else {
2088:       allcolumns[i] = PETSC_FALSE;
2089:       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2090:     }
2091:   }

2093:   if (scall == MAT_REUSE_MATRIX) {
2094:     /* Assumes new rows are same length as the old rows */
2095:     for (PetscInt i = 0; i < ismax; i++) {
2096:       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2097:       subc = (Mat_SeqAIJ *)submats[i]->data;
2098:       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");

2100:       /* Initialize the matrix as if it were empty */
2101:       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));

2103:       smat_i = subc->submatis1;

2105:       nrqs        = smat_i->nrqs;
2106:       nrqr        = smat_i->nrqr;
2107:       rbuf1       = smat_i->rbuf1;
2108:       rbuf2       = smat_i->rbuf2;
2109:       rbuf3       = smat_i->rbuf3;
2110:       req_source2 = smat_i->req_source2;

2112:       sbuf1 = smat_i->sbuf1;
2113:       sbuf2 = smat_i->sbuf2;
2114:       ptr   = smat_i->ptr;
2115:       tmp   = smat_i->tmp;
2116:       ctr   = smat_i->ctr;

2118:       pa          = smat_i->pa;
2119:       req_size    = smat_i->req_size;
2120:       req_source1 = smat_i->req_source1;

2122:       allcolumns[i] = smat_i->allcolumns;
2123:       row2proc[i]   = smat_i->row2proc;
2124:       rmap[i]       = smat_i->rmap;
2125:       cmap[i]       = smat_i->cmap;
2126:     }

2128:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2129:       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2130:       smat_i = (Mat_SubSppt *)submats[0]->data;

2132:       nrqs        = smat_i->nrqs;
2133:       nrqr        = smat_i->nrqr;
2134:       rbuf1       = smat_i->rbuf1;
2135:       rbuf2       = smat_i->rbuf2;
2136:       rbuf3       = smat_i->rbuf3;
2137:       req_source2 = smat_i->req_source2;

2139:       sbuf1 = smat_i->sbuf1;
2140:       sbuf2 = smat_i->sbuf2;
2141:       ptr   = smat_i->ptr;
2142:       tmp   = smat_i->tmp;
2143:       ctr   = smat_i->ctr;

2145:       pa          = smat_i->pa;
2146:       req_size    = smat_i->req_size;
2147:       req_source1 = smat_i->req_source1;

2149:       allcolumns[0] = PETSC_FALSE;
2150:     }
2151:   } else { /* scall == MAT_INITIAL_MATRIX */
2152:     /* Get some new tags to keep the communication clean */
2153:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2154:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

2156:     /* Evaluate communication: to whom each message goes, the message lengths, and the buffer space
2157:      required. Based on this, buffers are allocated and data is copied into them */
2158:     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* message size, initialize work vectors */

2160:     for (PetscInt i = 0; i < ismax; i++) {
2161:       jmax   = nrow[i];
2162:       irow_i = irow[i];

2164:       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2165:       row2proc[i] = row2proc_i;

2167:       if (issorted[i]) proc = 0;
2168:       for (PetscInt j = 0; j < jmax; j++) {
2169:         if (!issorted[i]) proc = 0;
2170:         row = irow_i[j];
2171:         while (row >= C->rmap->range[proc + 1]) proc++;
2172:         w4[proc]++;
2173:         row2proc_i[j] = proc; /* map row index to proc */
2174:       }
2175:       for (PetscMPIInt j = 0; j < size; j++) {
2176:         if (w4[j]) {
2177:           w1[j] += w4[j];
2178:           w3[j]++;
2179:           w4[j] = 0;
2180:         }
2181:       }
2182:     }

2184:     nrqs     = 0; /* no of outgoing messages */
2185:     msz      = 0; /* total mesg length (for all procs) */
2186:     w1[rank] = 0; /* no mesg sent to self */
2187:     w3[rank] = 0;
2188:     for (PetscMPIInt i = 0; i < size; i++) {
2189:       if (w1[i]) {
2190:         w2[i] = 1;
2191:         nrqs++;
2192:       } /* there exists a message to proc i */
2193:     }
2194:     PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc array) */
2195:     for (PetscMPIInt i = 0, j = 0; i < size; i++) {
2196:       if (w1[i]) {
2197:         pa[j] = i;
2198:         j++;
2199:       }
2200:     }

2202:     /* Each message has a header of 1 + 2*(num of IS) entries followed by the data */
2203:     for (PetscMPIInt i = 0; i < nrqs; i++) {
2204:       PetscMPIInt j = pa[i];
2205:       w1[j] += w2[j] + 2 * w3[j];
2206:       msz += w1[j];
2207:     }
2208:     PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz));

2210:     /* Determine the number of messages to expect, their lengths, and the ranks they come from */
2211:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2212:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

2214:     /* Now post the Irecvs corresponding to these messages */
2215:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2216:     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

2218:     /* Allocate Memory for outgoing messages */
2219:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2220:     PetscCall(PetscArrayzero(sbuf1, size));
2221:     PetscCall(PetscArrayzero(ptr, size));

2223:     {
2224:       PetscInt *iptr = tmp;
2225:       k              = 0;
2226:       for (PetscMPIInt i = 0; i < nrqs; i++) {
2227:         PetscMPIInt j = pa[i];
2228:         iptr += k;
2229:         sbuf1[j] = iptr;
2230:         k        = w1[j];
2231:       }
2232:     }

2234:     /* Form the outgoing messages. Initialize the header space */
2235:     for (PetscMPIInt i = 0; i < nrqs; i++) {
2236:       PetscMPIInt j = pa[i];
2237:       sbuf1[j][0]   = 0;
2238:       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2239:       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2240:     }

2242:     /* Parse the isrow and copy data into outbuf */
2243:     for (PetscInt i = 0; i < ismax; i++) {
2244:       row2proc_i = row2proc[i];
2245:       PetscCall(PetscArrayzero(ctr, size));
2246:       irow_i = irow[i];
2247:       jmax   = nrow[i];
2248:       for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */
2249:         proc = row2proc_i[j];
2250:         if (proc != rank) { /* copy to the outgoing buf */
2251:           ctr[proc]++;
2252:           *ptr[proc] = irow_i[j];
2253:           ptr[proc]++;
2254:         }
2255:       }
2256:       /* Update the headers for the current IS */
2257:       for (PetscMPIInt j = 0; j < size; j++) { /* this loop could also be optimized */
2258:         if ((ctr_j = ctr[j])) {
2259:           sbuf1_j            = sbuf1[j];
2260:           k                  = ++sbuf1_j[0];
2261:           sbuf1_j[2 * k]     = ctr_j;
2262:           sbuf1_j[2 * k - 1] = i;
2263:         }
2264:       }
2265:     }
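          /* General layout of an outgoing request sbuf1[j] (header of 1 + 2*w3[j] entries, then data):
               [ #IS blocks | is_no_1, #rows_1 | is_no_2, #rows_2 | ... | rows of block 1 | rows of block 2 | ... ]
             where the (is_no, #rows) pairs and the row-index groups appear in the same order */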

2267:     /* Now post the sends */
2268:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2269:     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2270:       PetscMPIInt j = pa[i];
2271:       PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2272:     }

2274:     /* Post Receives to capture the buffer size */
2275:     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2276:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2277:     if (nrqs) rbuf2[0] = tmp + msz;
2278:     for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2279:     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2280:       PetscMPIInt j = pa[i];
2281:       PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2282:     }

2284:     /* Send to other procs the buffer size they should allocate */
2285:     /* Receive messages */
2286:     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2287:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2288:     {
2289:       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2290:       PetscInt *sbuf2_i;

2292:       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2293:       for (PetscMPIInt i = 0; i < nrqr; ++i) {
2294:         req_size[i] = 0;
2295:         rbuf1_i     = rbuf1[i];
2296:         start       = 2 * rbuf1_i[0] + 1;
2297:         end         = olengths1[i];
2298:         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2299:         sbuf2_i = sbuf2[i];
2300:         for (PetscInt j = start; j < end; j++) {
2301:           id         = rbuf1_i[j] - rstart;
2302:           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2303:           sbuf2_i[j] = ncols;
2304:           req_size[i] += ncols;
2305:         }
2306:         req_source1[i] = onodes1[i];
2307:         /* form the header */
2308:         sbuf2_i[0] = req_size[i];
2309:         for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

2311:         PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2312:       }
2313:     }

2315:     PetscCall(PetscFree(onodes1));
2316:     PetscCall(PetscFree(olengths1));
2317:     PetscCall(PetscFree(r_waits1));
2318:     PetscCall(PetscFree4(w1, w2, w3, w4));

2320:     /* Receive messages */
2321:     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2322:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2323:     for (PetscMPIInt i = 0; i < nrqs; ++i) {
2324:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2325:       req_source2[i] = pa[i];
2326:       PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2327:     }
2328:     PetscCall(PetscFree(r_waits2));

2330:     /* Wait on sends1 and sends2 */
2331:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2332:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2333:     PetscCall(PetscFree(s_waits1));
2334:     PetscCall(PetscFree(s_waits2));

2336:     /* Now allocate sending buffers for a->j, and send them off */
2337:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2338:     jcnt = 0;
2339:     for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2340:     if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0]));
2341:     for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

2343:     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2344:     {
2345:       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2346:       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2347:       PetscInt  cend = C->cmap->rend;
2348:       PetscInt *a_j = a->j, *b_j = b->j, ctmp;

2350:       for (PetscMPIInt i = 0; i < nrqr; i++) {
2351:         rbuf1_i   = rbuf1[i];
2352:         sbuf_aj_i = sbuf_aj[i];
2353:         ct1       = 2 * rbuf1_i[0] + 1;
2354:         ct2       = 0;
2355:         for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2356:           kmax = rbuf1[i][2 * j];
2357:           for (PetscInt k = 0; k < kmax; k++, ct1++) {
2358:             row    = rbuf1_i[ct1] - rstart;
2359:             nzA    = a_i[row + 1] - a_i[row];
2360:             nzB    = b_i[row + 1] - b_i[row];
2361:             ncols  = nzA + nzB;
2362:             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2363:             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);

2365:             /* load the column indices for this row into cols */
2366:             cols = sbuf_aj_i + ct2;

2368:             lwrite = 0;
2369:             for (PetscInt l = 0; l < nzB; l++) {
2370:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2371:             }
2372:             for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2373:             for (PetscInt l = 0; l < nzB; l++) {
2374:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2375:             }

2377:             ct2 += ncols;
2378:           }
2379:         }
2380:         PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2381:       }
2382:     }

2384:     /* create col map: global col of C -> local col of submatrices */
2385:     {
2386:       const PetscInt *icol_i;
2387: #if defined(PETSC_USE_CTABLE)
2388:       for (PetscInt i = 0; i < ismax; i++) {
2389:         if (!allcolumns[i]) {
2390:           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));

2392:           jmax   = ncol[i];
2393:           icol_i = icol[i];
2394:           cmap_i = cmap[i];
2395:           for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2396:         } else cmap[i] = NULL;
2397:       }
2398: #else
2399:       for (PetscInt i = 0; i < ismax; i++) {
2400:         if (!allcolumns[i]) {
2401:           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2402:           jmax   = ncol[i];
2403:           icol_i = icol[i];
2404:           cmap_i = cmap[i];
2405:           for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2406:         } else cmap[i] = NULL;
2407:       }
2408: #endif
2409:     }

2411:     /* Create lens which is required for MatCreate... */
2412:     jcnt = 0;
2413:     for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i];
2414:     PetscCall(PetscMalloc1(ismax, &lens));

2416:     if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0]));
2417:     for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);

2419:     /* Update lens from local data */
2420:     for (PetscInt i = 0; i < ismax; i++) {
2421:       row2proc_i = row2proc[i];
2422:       jmax       = nrow[i];
2423:       if (!allcolumns[i]) cmap_i = cmap[i];
2424:       irow_i = irow[i];
2425:       lens_i = lens[i];
2426:       for (PetscInt j = 0; j < jmax; j++) {
2427:         row  = irow_i[j];
2428:         proc = row2proc_i[j];
2429:         if (proc == rank) {
2430:           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2431:           if (!allcolumns[i]) {
2432:             for (PetscInt k = 0; k < ncols; k++) {
2433: #if defined(PETSC_USE_CTABLE)
2434:               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2435: #else
2436:               tcol = cmap_i[cols[k]];
2437: #endif
2438:               if (tcol) lens_i[j]++;
2439:             }
2440:           } else { /* allcolumns */
2441:             lens_i[j] = ncols;
2442:           }
2443:           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2444:         }
2445:       }
2446:     }

2448:     /* Create row map: global row of C -> local row of submatrices */
2449: #if defined(PETSC_USE_CTABLE)
2450:     for (PetscInt i = 0; i < ismax; i++) {
2451:       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2452:       irow_i = irow[i];
2453:       jmax   = nrow[i];
2454:       for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2455:     }
2456: #else
2457:     for (PetscInt i = 0; i < ismax; i++) {
2458:       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2459:       rmap_i = rmap[i];
2460:       irow_i = irow[i];
2461:       jmax   = nrow[i];
2462:       for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2463:     }
2464: #endif

2466:     /* Update lens from off-process data */
2467:     {
2468:       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;

2470:       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2471:       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2472:         sbuf1_i = sbuf1[pa[tmp2]];
2473:         jmax    = sbuf1_i[0];
2474:         ct1     = 2 * jmax + 1;
2475:         ct2     = 0;
2476:         rbuf2_i = rbuf2[tmp2];
2477:         rbuf3_i = rbuf3[tmp2];
2478:         for (PetscInt j = 1; j <= jmax; j++) {
2479:           is_no  = sbuf1_i[2 * j - 1];
2480:           max1   = sbuf1_i[2 * j];
2481:           lens_i = lens[is_no];
2482:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2483:           rmap_i = rmap[is_no];
2484:           for (PetscInt k = 0; k < max1; k++, ct1++) {
2485: #if defined(PETSC_USE_CTABLE)
2486:             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2487:             row--;
2488:             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2489: #else
2490:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2491: #endif
2492:             max2 = rbuf2_i[ct1];
2493:             for (PetscInt l = 0; l < max2; l++, ct2++) {
2494:               if (!allcolumns[is_no]) {
2495: #if defined(PETSC_USE_CTABLE)
2496:                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2497: #else
2498:                 tcol = cmap_i[rbuf3_i[ct2]];
2499: #endif
2500:                 if (tcol) lens_i[row]++;
2501:               } else {         /* allcolumns */
2502:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2503:               }
2504:             }
2505:           }
2506:         }
2507:       }
2508:     }
2509:     PetscCall(PetscFree(r_waits3));
2510:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2511:     PetscCall(PetscFree(s_waits3));

2513:     /* Create the submatrices */
2514:     for (PetscInt i = 0; i < ismax; i++) {
2515:       PetscInt rbs, cbs;

2517:       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2518:       PetscCall(ISGetBlockSize(iscol[i], &cbs));

2520:       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2521:       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));

2523:       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2524:       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2525:       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));

2527:       /* create struct Mat_SubSppt and attach it to submat */
2528:       PetscCall(PetscNew(&smat_i));
2529:       subc            = (Mat_SeqAIJ *)submats[i]->data;
2530:       subc->submatis1 = smat_i;

2532:       smat_i->destroy          = submats[i]->ops->destroy;
2533:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2534:       submats[i]->factortype   = C->factortype;

2536:       smat_i->id          = i;
2537:       smat_i->nrqs        = nrqs;
2538:       smat_i->nrqr        = nrqr;
2539:       smat_i->rbuf1       = rbuf1;
2540:       smat_i->rbuf2       = rbuf2;
2541:       smat_i->rbuf3       = rbuf3;
2542:       smat_i->sbuf2       = sbuf2;
2543:       smat_i->req_source2 = req_source2;

2545:       smat_i->sbuf1 = sbuf1;
2546:       smat_i->ptr   = ptr;
2547:       smat_i->tmp   = tmp;
2548:       smat_i->ctr   = ctr;

2550:       smat_i->pa          = pa;
2551:       smat_i->req_size    = req_size;
2552:       smat_i->req_source1 = req_source1;

2554:       smat_i->allcolumns = allcolumns[i];
2555:       smat_i->singleis   = PETSC_FALSE;
2556:       smat_i->row2proc   = row2proc[i];
2557:       smat_i->rmap       = rmap[i];
2558:       smat_i->cmap       = cmap[i];
2559:     }

2561:     if (!ismax) { /* Create a dummy submats[0] to carry the Mat_SubSppt struct for reuse */
2562:       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2563:       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2564:       PetscCall(MatSetType(submats[0], MATDUMMY));

2566:       /* create struct Mat_SubSppt and attach it to submat */
2567:       PetscCall(PetscNew(&smat_i));
2568:       submats[0]->data = (void *)smat_i;

2570:       smat_i->destroy          = submats[0]->ops->destroy;
2571:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2572:       submats[0]->factortype   = C->factortype;

2574:       smat_i->id          = 0;
2575:       smat_i->nrqs        = nrqs;
2576:       smat_i->nrqr        = nrqr;
2577:       smat_i->rbuf1       = rbuf1;
2578:       smat_i->rbuf2       = rbuf2;
2579:       smat_i->rbuf3       = rbuf3;
2580:       smat_i->sbuf2       = sbuf2;
2581:       smat_i->req_source2 = req_source2;

2583:       smat_i->sbuf1 = sbuf1;
2584:       smat_i->ptr   = ptr;
2585:       smat_i->tmp   = tmp;
2586:       smat_i->ctr   = ctr;

2588:       smat_i->pa          = pa;
2589:       smat_i->req_size    = req_size;
2590:       smat_i->req_source1 = req_source1;

2592:       smat_i->allcolumns = PETSC_FALSE;
2593:       smat_i->singleis   = PETSC_FALSE;
2594:       smat_i->row2proc   = NULL;
2595:       smat_i->rmap       = NULL;
2596:       smat_i->cmap       = NULL;
2597:     }

2599:     if (ismax) PetscCall(PetscFree(lens[0]));
2600:     PetscCall(PetscFree(lens));
2601:     if (sbuf_aj) {
2602:       PetscCall(PetscFree(sbuf_aj[0]));
2603:       PetscCall(PetscFree(sbuf_aj));
2604:     }

2606:   } /* end of scall == MAT_INITIAL_MATRIX */

2608:   /* Post recv matrix values */
2609:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2610:   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2611:   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2612:   for (PetscMPIInt i = 0; i < nrqs; ++i) {
2613:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2614:     PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2615:   }

2617:   /* Allocate sending buffers for a->a, and send them off */
2618:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2619:   jcnt = 0;
2620:   for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i];
2621:   if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0]));
2622:   for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

2624:   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2625:   {
2626:     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2627:     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2628:     PetscInt     cend = C->cmap->rend;
2629:     PetscInt    *b_j  = b->j;
2630:     PetscScalar *vworkA, *vworkB, *a_a, *b_a;

2632:     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2633:     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2634:     for (PetscMPIInt i = 0; i < nrqr; i++) {
2635:       rbuf1_i   = rbuf1[i];
2636:       sbuf_aa_i = sbuf_aa[i];
2637:       ct1       = 2 * rbuf1_i[0] + 1;
2638:       ct2       = 0;
2639:       for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2640:         kmax = rbuf1_i[2 * j];
2641:         for (PetscInt k = 0; k < kmax; k++, ct1++) {
2642:           row    = rbuf1_i[ct1] - rstart;
2643:           nzA    = a_i[row + 1] - a_i[row];
2644:           nzB    = b_i[row + 1] - b_i[row];
2645:           ncols  = nzA + nzB;
2646:           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2647:           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2648:           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);

2650:           /* load the column values for this row into vals */
2651:           vals = sbuf_aa_i + ct2;

2653:           lwrite = 0;
2654:           for (PetscInt l = 0; l < nzB; l++) {
2655:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2656:           }
2657:           for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2658:           for (PetscInt l = 0; l < nzB; l++) {
2659:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2660:           }

2662:           ct2 += ncols;
2663:         }
2664:       }
2665:       PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2666:     }
2667:     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2668:     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2669:   }

2671:   /* Assemble the matrices */
2672:   /* First assemble the local rows */
2673:   for (PetscInt i = 0; i < ismax; i++) {
2674:     row2proc_i = row2proc[i];
2675:     subc       = (Mat_SeqAIJ *)submats[i]->data;
2676:     imat_ilen  = subc->ilen;
2677:     imat_j     = subc->j;
2678:     imat_i     = subc->i;
2679:     imat_a     = subc->a;

2681:     if (!allcolumns[i]) cmap_i = cmap[i];
2682:     rmap_i = rmap[i];
2683:     irow_i = irow[i];
2684:     jmax   = nrow[i];
2685:     for (PetscInt j = 0; j < jmax; j++) {
2686:       row  = irow_i[j];
2687:       proc = row2proc_i[j];
2688:       if (proc == rank) {
2689:         old_row = row;
2690: #if defined(PETSC_USE_CTABLE)
2691:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2692:         row--;
2693: #else
2694:         row = rmap_i[row];
2695: #endif
2696:         ilen_row = imat_ilen[row];
2697:         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2698:         mat_i = imat_i[row];
2699:         mat_a = imat_a + mat_i;
2700:         mat_j = imat_j + mat_i;
2701:         if (!allcolumns[i]) {
2702:           for (PetscInt k = 0; k < ncols; k++) {
2703: #if defined(PETSC_USE_CTABLE)
2704:             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2705: #else
2706:             tcol = cmap_i[cols[k]];
2707: #endif
2708:             if (tcol) {
2709:               *mat_j++ = tcol - 1;
2710:               *mat_a++ = vals[k];
2711:               ilen_row++;
2712:             }
2713:           }
2714:         } else { /* allcolumns */
2715:           for (PetscInt k = 0; k < ncols; k++) {
2716:             *mat_j++ = cols[k]; /* global col index! */
2717:             *mat_a++ = vals[k];
2718:             ilen_row++;
2719:           }
2720:         }
2721:         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));

2723:         imat_ilen[row] = ilen_row;
2724:       }
2725:     }
2726:   }

2728:   /* Now assemble the off-process rows */
2729:   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2730:   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2731:     sbuf1_i = sbuf1[pa[tmp2]];
2732:     jmax    = sbuf1_i[0];
2733:     ct1     = 2 * jmax + 1;
2734:     ct2     = 0;
2735:     rbuf2_i = rbuf2[tmp2];
2736:     rbuf3_i = rbuf3[tmp2];
2737:     rbuf4_i = rbuf4[tmp2];
2738:     for (PetscInt j = 1; j <= jmax; j++) {
2739:       is_no  = sbuf1_i[2 * j - 1];
2740:       rmap_i = rmap[is_no];
2741:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2742:       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2743:       imat_ilen = subc->ilen;
2744:       imat_j    = subc->j;
2745:       imat_i    = subc->i;
2746:       imat_a    = subc->a;
2747:       max1      = sbuf1_i[2 * j];
2748:       for (PetscInt k = 0; k < max1; k++, ct1++) {
2749:         row = sbuf1_i[ct1];
2750: #if defined(PETSC_USE_CTABLE)
2751:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2752:         row--;
2753: #else
2754:         row = rmap_i[row];
2755: #endif
2756:         ilen  = imat_ilen[row];
2757:         mat_i = imat_i[row];
2758:         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2759:         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2760:         max2  = rbuf2_i[ct1];
2761:         if (!allcolumns[is_no]) {
2762:           for (PetscInt l = 0; l < max2; l++, ct2++) {
2763: #if defined(PETSC_USE_CTABLE)
2764:             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2765: #else
2766:             tcol = cmap_i[rbuf3_i[ct2]];
2767: #endif
2768:             if (tcol) {
2769:               *mat_j++ = tcol - 1;
2770:               *mat_a++ = rbuf4_i[ct2];
2771:               ilen++;
2772:             }
2773:           }
2774:         } else { /* allcolumns */
2775:           for (PetscInt l = 0; l < max2; l++, ct2++) {
2776:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2777:             *mat_a++ = rbuf4_i[ct2];
2778:             ilen++;
2779:           }
2780:         }
2781:         imat_ilen[row] = ilen;
2782:       }
2783:     }
2784:   }

2786:   if (!iscsorted) { /* sort column indices of the rows */
2787:     for (PetscInt i = 0; i < ismax; i++) {
2788:       subc      = (Mat_SeqAIJ *)submats[i]->data;
2789:       imat_j    = subc->j;
2790:       imat_i    = subc->i;
2791:       imat_a    = subc->a;
2792:       imat_ilen = subc->ilen;

2794:       if (allcolumns[i]) continue;
2795:       jmax = nrow[i];
2796:       for (PetscInt j = 0; j < jmax; j++) {
2797:         mat_i = imat_i[j];
2798:         mat_a = imat_a + mat_i;
2799:         mat_j = imat_j + mat_i;
2800:         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2801:       }
2802:     }
2803:   }

2805:   PetscCall(PetscFree(r_waits4));
2806:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2807:   PetscCall(PetscFree(s_waits4));

2809:   /* Restore the indices */
2810:   for (PetscInt i = 0; i < ismax; i++) {
2811:     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2812:     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2813:   }

2815:   for (PetscInt i = 0; i < ismax; i++) {
2816:     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2817:     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2818:   }

2820:   /* Destroy allocated memory */
2821:   if (sbuf_aa) {
2822:     PetscCall(PetscFree(sbuf_aa[0]));
2823:     PetscCall(PetscFree(sbuf_aa));
2824:   }
2825:   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));

2827:   for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2828:   PetscCall(PetscFree(rbuf4));

2830:   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2831:   PetscFunctionReturn(PETSC_SUCCESS);
2832: }

2834: /*
2835:  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2836:  Embeddings are assumed to be injections; together with the above, this implies that the range of rowemb
2837:  is a subset of [0,m), the range of dcolemb a subset of [0,n), and the range of ocolemb a subset of [0,N-n).
2838:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2839:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2840:  state, and needs to be "assembled" later by compressing B's column space.

2842:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2843:  Following this call, C's local blocks aij->A & aij->B have been created, even if empty.
2844:  */
2845: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2846: {
2847:   /* If making this function public, change the error returned in this function away from _PLIB. */
2848:   Mat_MPIAIJ     *aij;
2849:   Mat_SeqAIJ     *Baij;
2850:   PetscBool       seqaij, Bdisassembled;
2851:   PetscInt        m, n, *nz, ngcol, col, rstart, rend, shift, count;
2852:   PetscScalar     v;
2853:   const PetscInt *rowindices, *colindices;

2855:   PetscFunctionBegin;
2856:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2857:   if (A) {
2858:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2859:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2860:     if (rowemb) {
2861:       PetscCall(ISGetLocalSize(rowemb, &m));
2862:       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2863:     } else {
2864:       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2865:     }
2866:     if (dcolemb) {
2867:       PetscCall(ISGetLocalSize(dcolemb, &n));
2868:       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2869:     } else {
2870:       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2871:     }
2872:   }
2873:   if (B) {
2874:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2875:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2876:     if (rowemb) {
2877:       PetscCall(ISGetLocalSize(rowemb, &m));
2878:       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2879:     } else {
2880:       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2881:     }
2882:     if (ocolemb) {
2883:       PetscCall(ISGetLocalSize(ocolemb, &n));
2884:       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2885:     } else {
2886:       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2887:     }
2888:   }

2890:   aij = (Mat_MPIAIJ *)C->data;
2891:   if (!aij->A) {
2892:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2893:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2894:     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2895:     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2896:     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2897:   }
2898:   if (A) {
2899:     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2900:   } else {
2901:     PetscCall(MatSetUp(aij->A));
2902:   }
2903:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2904:     /*
2905:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate aij->B and
2906:       need to "disassemble" it -- convert it to use C's global column indices.
2907:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2909:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2910:       we call MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2912:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2913:       At least avoid calling MatSetValues() and the implied searches?
2914:     */

2916:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2917: #if defined(PETSC_USE_CTABLE)
2918:       PetscCall(PetscHMapIDestroy(&aij->colmap));
2919: #else
2920:       PetscCall(PetscFree(aij->colmap));
2921:       /* A bit of a HACK: ideally we should deal with the case of aij->B all in one code block below. */
2922: #endif
2923:       ngcol = 0;
2924:       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2925:       if (aij->garray) PetscCall(PetscFree(aij->garray));
2926:       PetscCall(VecDestroy(&aij->lvec));
2927:       PetscCall(VecScatterDestroy(&aij->Mvctx));
2928:     }
2929:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2930:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2931:   }
2932:   Bdisassembled = PETSC_FALSE;
2933:   if (!aij->B) {
2934:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2935:     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2936:     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2937:     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2938:     Bdisassembled = PETSC_TRUE;
2939:   }
2940:   if (B) {
2941:     Baij = (Mat_SeqAIJ *)B->data;
2942:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2943:       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2944:       for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2945:       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2946:       PetscCall(PetscFree(nz));
2947:     }

2949:     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2950:     shift      = rend - rstart;
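    /* shift is the number of locally owned rows (taken here as the width of this rank's diagonal
       block, i.e. the row and column ownership ranges are assumed to coincide); when aij->B was
       freshly created above (Bdisassembled) with C->cmap->N columns, input column indices at or
       past rstart are shifted past the diagonal block so they become C's global column indices */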
2951:     count      = 0;
2952:     rowindices = NULL;
2953:     colindices = NULL;
2954:     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2955:     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2956:     for (PetscInt i = 0; i < B->rmap->n; i++) {
2957:       PetscInt row;
2958:       row = i;
2959:       if (rowindices) row = rowindices[i];
2960:       for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2961:         col = Baij->j[count];
2962:         if (colindices) col = colindices[col];
2963:         if (Bdisassembled && col >= rstart) col += shift;
2964:         v = Baij->a[count];
2965:         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2966:         ++count;
2967:       }
2968:     }
2969:     /* No assembly for aij->B is necessary. */
2970:     /* FIXME: set aij->B's nonzerostate correctly. */
2971:   } else {
2972:     PetscCall(MatSetUp(aij->B));
2973:   }
2974:   C->preallocated  = PETSC_TRUE;
2975:   C->was_assembled = PETSC_FALSE;
2976:   C->assembled     = PETSC_FALSE;
2977:   /*
2978:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2979:       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2980:    */
2981:   PetscFunctionReturn(PETSC_SUCCESS);
2982: }
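
/*
  A minimal usage sketch (not taken from this file; names such as Aseq/Bseq, m, n, and comm are
  hypothetical) mirroring what MatCreateSubMatricesMPI_MPIXAIJ() below does when it stitches
  sequential pieces into a parallel matrix: create an MPIAIJ shell, hand it the diagonal and
  off-diagonal sequential blocks, then assemble so that aij->B is compressed into local form.

    Mat C;
    PetscCall(MatCreate(comm, &C));
    PetscCall(MatSetSizes(C, m, n, PETSC_DETERMINE, PETSC_DETERMINE)); // m x n local block
    PetscCall(MatSetType(C, MATMPIAIJ));
    PetscCall(PetscLayoutSetUp(C->rmap));
    PetscCall(PetscLayoutSetUp(C->cmap));
    // Aseq is m x n (diagonal block); Bseq is m x (N - n) (off-diagonal block).
    // NULL embeddings mean rows/columns are taken in their natural order.
    PetscCall(MatSetSeqMats_MPIAIJ(C, NULL, NULL, NULL, DIFFERENT_NONZERO_PATTERN, Aseq, Bseq));
    PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
*/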

2984: /*
2985:   B uses local column indices in the range [0, N-n); they must be interpreted using garray.
2986:  */
2987: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2988: {
2989:   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;

2991:   PetscFunctionBegin;
2992:   PetscAssertPointer(A, 2);
2993:   PetscAssertPointer(B, 3);
2994:   /* FIXME: make sure C is assembled */
2995:   *A = aij->A;
2996:   *B = aij->B;
2997:   /* Note that we don't incref *A and *B, so be careful! */
2998:   PetscFunctionReturn(PETSC_SUCCESS);
2999: }
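
/*
  A small sketch (assuming the usual MPIAIJ convention that aij->garray maps local off-diagonal
  column indices to global column indices) of how a caller could translate B's columns back into
  C's global column space:

    Mat         A, B;
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
    Mat_SeqAIJ *b;

    PetscCall(MatGetSeqMats_MPIAIJ(C, &A, &B));
    b = (Mat_SeqAIJ *)B->data;
    for (PetscInt i = 0; i < B->rmap->n; i++) {
      for (PetscInt k = b->i[i]; k < b->i[i + 1]; k++) {
        PetscInt gcol = aij->garray[b->j[k]]; // global column index in C
        (void)gcol;
      }
    }
*/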

3001: /*
3002:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3003:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3004: */
3005: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3006: {
3007:   PetscMPIInt size, flag;
3008:   PetscInt    cismax, ispar;
3009:   Mat        *A, *B;
3010:   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

3012:   PetscFunctionBegin;
3013:   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

3015:   cismax = 0;
3016:   for (PetscInt i = 0; i < ismax; ++i) {
3017:     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3018:     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3019:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3020:     if (size > 1) ++cismax;
3021:   }

3023:   /*
3024:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3025:      ispar is the maximum over C's ranks of the per-rank count of parallel ISs, so it is nonzero iff any rank has one.
3026:   */
3027:   PetscCallMPI(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3028:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3029:     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3030:     PetscFunctionReturn(PETSC_SUCCESS);
3031:   }

3033:   /* ispar != 0: at least one rank has a parallel IS */
3034:   /*
3035:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3036:     These are used to extract the off-diag portion of the resulting parallel matrix.
3037:     The row IS for the off-diag portion is the same as for the diag portion,
3038:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3039:   */
3040:   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3041:   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3042:   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3043:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3044:     if (size > 1) {
3045:       /*
3046:          TODO: This is the part that's ***NOT SCALABLE***.
3047:          To fix this we need to extract just the indices of C's nonzero columns
3048:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3049:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3050:          to be done without serializing on the IS list, so, most likely, it is best
3051:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3052:       */
3053:       PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3054:       /* Now we have to
3055:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3056:              were sorted on each rank, concatenated they might no longer be sorted;
3057:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3058:              indices in the nondecreasing order to the original index positions.
3059:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3060:       */
3061:       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3062:       PetscCall(ISSort(ciscol[ii]));
3063:       ++ii;
3064:     }
3065:   }
3066:   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3067:   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3068:     PetscInt        issize;
3069:     const PetscInt *indices;

3071:     /*
3072:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3073:      */
3074:     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3075:     PetscCall(ISSort(isrow[i]));
3076:     PetscCall(ISGetLocalSize(isrow[i], &issize));
3077:     PetscCall(ISGetIndices(isrow[i], &indices));
3078:     for (PetscInt j = 1; j < issize; ++j) {
3079:       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3080:     }
3081:     PetscCall(ISRestoreIndices(isrow[i], &indices));
3082:     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3083:     PetscCall(ISSort(iscol[i]));
3084:     PetscCall(ISGetLocalSize(iscol[i], &issize));
3085:     PetscCall(ISGetIndices(iscol[i], &indices));
3086:     for (PetscInt j = 1; j < issize; ++j) {
3087:       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3088:     }
3089:     PetscCall(ISRestoreIndices(iscol[i], &indices));
3090:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3091:     if (size > 1) {
3092:       cisrow[ii] = isrow[i];
3093:       ++ii;
3094:     }
3095:   }
3096:   /*
3097:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3098:     array of sequential matrices underlying the resulting parallel matrices.
3099:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3100:     contain duplicates.

3102:     There are as many diag matrices as there are original index sets. There are only as many parallel
3103:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3105:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3106:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3107:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3108:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3109:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3110:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3111:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3112:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3113:       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3114:       values into A[i] and B[ii] sitting inside the corresponding submat.
3115:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3116:       A[i], B[ii], AA[i] or BB[ii] matrices.
3117:   */
3118:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3119:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

3121:   /* Now obtain the sequential A and B submatrices separately. */
3122:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3123:   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3124:   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

3126:   /*
3127:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3128:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3129:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3130:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3131:     to have the same sparsity pattern.
3132:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3133:     must be constructed for C. This is done by setseqmat(s).
3134:   */
3135:   for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
3136:     /*
3137:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3138:        That way we can avoid sorting and computing permutations when reusing.
3139:        To this end:
3140:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3141:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3142:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3143:     */
3144:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3146:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3147:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3148:     if (size > 1) {
3149:       if (scall == MAT_INITIAL_MATRIX) {
3150:         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3151:         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3152:         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3153:         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3154:         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3155:       }
3156:       /*
3157:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3158:       */
3159:       {
3160:         Mat AA = A[i], BB = B[ii];

3162:         if (AA || BB) {
3163:           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3164:           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3165:           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3166:         }
3167:         PetscCall(MatDestroy(&AA));
3168:       }
3169:       PetscCall(ISDestroy(ciscol + ii));
3170:       PetscCall(ISDestroy(ciscol_p + ii));
3171:       ++ii;
3172:     } else { /* if (size == 1) */
3173:       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3174:       if (isrow_p[i] || iscol_p[i]) {
3175:         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3176:         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3177:         /* Otherwise A is extracted straight into (*submat)[i]. */
3178:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && scall == MAT_INITIAL_MATRIX). */
3179:         PetscCall(MatDestroy(A + i));
3180:       } else (*submat)[i] = A[i];
3181:     }
3182:     PetscCall(ISDestroy(&isrow_p[i]));
3183:     PetscCall(ISDestroy(&iscol_p[i]));
3184:   }
3185:   PetscCall(PetscFree2(cisrow, ciscol));
3186:   PetscCall(PetscFree2(isrow_p, iscol_p));
3187:   PetscCall(PetscFree(ciscol_p));
3188:   PetscCall(PetscFree(A));
3189:   PetscCall(MatDestroySubMatrices(cismax, &B));
3190:   PetscFunctionReturn(PETSC_SUCCESS);
3191: }

3193: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3194: {
3195:   PetscFunctionBegin;
3196:   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3197:   PetscFunctionReturn(PETSC_SUCCESS);
3198: }
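
/*
  A caller-side sketch (with a hypothetical selection of rows and columns) of the public entry
  point that ultimately lands in MatCreateSubMatricesMPI_MPIAIJ() above: extract one parallel
  submatrix of a square C consisting of the first half of each rank's locally owned rows and the
  matching columns.

    IS       isrow, iscol;
    Mat     *submat;
    PetscInt rstart, rend, nloc;

    PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
    nloc = (rend - rstart) / 2; // hypothetical choice of rows
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), nloc, rstart, 1, &isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), nloc, rstart, 1, &iscol));
    PetscCall(MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submat));
    // ... use submat[0] ...
    PetscCall(MatDestroySubMatrices(1, &submat));
    PetscCall(ISDestroy(&isrow));
    PetscCall(ISDestroy(&iscol));
*/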