Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:    and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

 21: /*
 22:    Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks

 24:    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size (bs), which would offer higher performance,
 25:    but that is a very major recoding job.

 27:    Possible scalability issue with this routine: it allocates space proportional to Nmax-Nmin
 28: */
 29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
 30: {
 31:   PetscFunctionBegin;
 32:   for (PetscInt i = 0; i < imax; i++) {
 33:     if (!is[i]) break;
 34:     PetscInt        n = 0, N, Nmax, Nmin;
 35:     const PetscInt *idx;
 36:     PetscInt       *nidx = NULL;
 37:     MPI_Comm        comm;
 38:     PetscBT         bt;

 40:     PetscCall(ISGetLocalSize(is[i], &N));
 41:     if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
 42:       PetscCall(ISGetIndices(is[i], &idx));
 43:       PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
 44:       Nmin = Nmin / bs;
 45:       Nmax = Nmax / bs;
 46:       PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
 47:       for (PetscInt j = 0; j < N; j++) {
 48:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
 49:       }
 50:       PetscCall(PetscMalloc1(n, &nidx));
 51:       n = 0;
 52:       PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
 53:       for (PetscInt j = 0; j < N; j++) {
 54:         if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
 55:       }
 56:       PetscCall(PetscBTDestroy(&bt));
 57:       PetscCall(ISRestoreIndices(is[i], &idx));
 58:     }
 59:     PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
 60:     PetscCall(ISDestroy(is + i));
 61:     PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
 62:   }
 63:   PetscFunctionReturn(PETSC_SUCCESS);
 64: }
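
/*
   Worked example (editor's sketch): with bs = 2 and an input IS holding the point indices
   {1, 4, 5}, the distinct block indices idx[j]/bs are {0, 2}, so the IS is replaced by a
   block IS built roughly as

     ISCreateBlock(comm, 2, 2, (PetscInt[]){0, 2}, PETSC_COPY_VALUES, &is);

   which, when expanded, covers the point indices {0, 1, 4, 5}; the extra entry 0 is one of
   the "needed values to fill out the blocks" mentioned above.
*/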

 66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
 67: {
 68:   PetscInt i;

 70:   PetscFunctionBegin;
 71:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 72:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
 73:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 74:   PetscFunctionReturn(PETSC_SUCCESS);
 75: }

 77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
 78: {
 79:   PetscInt i;

 81:   PetscFunctionBegin;
 82:   PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 83:   for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
 84:   if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
 85:   PetscFunctionReturn(PETSC_SUCCESS);
 86: }
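
/*
   Usage sketch (editor's note, not part of the original source): both routines above are the
   MPIAIJ implementations reached through the public MatIncreaseOverlap() interface, e.g. from
   an additive Schwarz setup. A minimal caller, where the matrix A comes from the application
   and nlocal/first are placeholders, might look like

     IS is[1];
     PetscCall(ISCreateStride(PETSC_COMM_SELF, nlocal, first, 1, &is[0]));
     PetscCall(MatIncreaseOverlap(A, 1, is, 2)); // grow is[0] by an overlap of 2
     PetscCall(ISDestroy(&is[0]));
*/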

 88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
 89: {
 90:   MPI_Comm        comm;
 91:   PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
 92:   PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
 93:   PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
 94:   const PetscInt *indices_i, **indices;
 95:   PetscLayout     rmap;
 96:   PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
 97:   PetscSF         sf;
 98:   PetscSFNode    *remote;

100:   PetscFunctionBegin;
101:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
103:   PetscCallMPI(MPI_Comm_size(comm, &size));
104:   /* get row map to determine where rows should be going */
105:   PetscCall(MatGetLayouts(mat, &rmap, NULL));
106:   /* retrieve IS data and put it all together so that we
107:    * can optimize communication
108:    */
109:   PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110:   for (i = 0, tlength = 0; i < nidx; i++) {
111:     PetscCall(ISGetLocalSize(is[i], &length[i]));
112:     tlength += length[i];
113:     PetscCall(ISGetIndices(is[i], &indices[i]));
114:   }
115:   /* find these rows on remote processors */
116:   PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117:   PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118:   nrrows = 0;
119:   for (i = 0; i < nidx; i++) {
120:     length_i  = length[i];
121:     indices_i = indices[i];
122:     for (j = 0; j < length_i; j++) {
123:       owner = -1;
124:       PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125:       /* remote processors */
126:       if (owner != rank) {
127:         tosizes_temp[owner]++;               /* number of rows to owner */
128:         rrow_ranks[nrrows]   = owner;        /* processor */
129:         rrow_isids[nrrows]   = i;            /* is id */
130:         remoterows[nrrows++] = indices_i[j]; /* row */
131:       }
132:     }
133:     PetscCall(ISRestoreIndices(is[i], &indices[i]));
134:   }
135:   PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136:   /* test if we need to exchange messages;
137:    * generally speaking, we do not need to exchange
138:    * data when the overlap is 1
139:    */
140:   PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
141:   /* we do not have any messages;
142:    * this usually corresponds to overlap 1
143:    */
144:   if (!reducednrrows) {
145:     PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146:     PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147:     PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148:     PetscFunctionReturn(PETSC_SUCCESS);
149:   }
150:   nto = 0;
151:   /* send sizes and ranks for building a two-sided communication */
152:   for (i = 0; i < size; i++) {
153:     if (tosizes_temp[i]) {
154:       tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155:       tosizes_temp[i]  = nto;                 /* a map from processor to index */
156:       toranks[nto++]   = i;                   /* processor */
157:     }
158:   }
159:   PetscCall(PetscMalloc1(nto + 1, &toffsets));
160:   toffsets[0] = 0;
161:   for (i = 0; i < nto; i++) {
162:     toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
163:     tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
164:   }
165:   /* send information to other processors */
166:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167:   nrecvrows = 0;
168:   for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169:   PetscCall(PetscMalloc1(nrecvrows, &remote));
170:   nrecvrows = 0;
171:   for (i = 0; i < nfrom; i++) {
172:     for (j = 0; j < fromsizes[2 * i]; j++) {
173:       remote[nrecvrows].rank    = fromranks[i];
174:       remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175:     }
176:   }
177:   PetscCall(PetscSFCreate(comm, &sf));
178:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
179:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
180:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181:   PetscCall(PetscSFSetFromOptions(sf));
182:   /* each message entry is a pair <id of the IS, row> */
183:   PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184:   for (i = 0; i < nrrows; i++) {
185:     owner                 = rrow_ranks[i];       /* processor */
186:     j                     = tosizes_temp[owner]; /* index */
187:     todata[toffsets[j]++] = rrow_isids[i];
188:     todata[toffsets[j]++] = remoterows[i];
189:   }
190:   PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191:   PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192:   PetscCall(PetscFree(toffsets));
193:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195:   PetscCall(PetscSFDestroy(&sf));
196:   /* send rows belonging to remote processors so that we can get the overlapping data back */
197:   PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198:   PetscCall(PetscFree2(todata, fromdata));
199:   PetscCall(PetscFree(fromsizes));
200:   PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201:   PetscCall(PetscFree(fromranks));
202:   nrecvrows = 0;
203:   for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204:   PetscCall(PetscCalloc1(nrecvrows, &todata));
205:   PetscCall(PetscMalloc1(nrecvrows, &remote));
206:   nrecvrows = 0;
207:   for (i = 0; i < nto; i++) {
208:     for (j = 0; j < tosizes[2 * i]; j++) {
209:       remote[nrecvrows].rank    = toranks[i];
210:       remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211:     }
212:   }
213:   PetscCall(PetscSFCreate(comm, &sf));
214:   PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
215:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided communication */
216:   PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217:   PetscCall(PetscSFSetFromOptions(sf));
218:   /* overlap communication and computation */
219:   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221:   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222:   PetscCall(PetscSFDestroy(&sf));
223:   PetscCall(PetscFree2(sbdata, sbsizes));
224:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225:   PetscCall(PetscFree(toranks));
226:   PetscCall(PetscFree(tosizes));
227:   PetscCall(PetscFree(todata));
228:   PetscFunctionReturn(PETSC_SUCCESS);
229: }
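
/*
   Editor's sketch of the PetscSF idiom used twice above (all names here are placeholders):
   each leaf records the rank that owns the data and the slot in that rank's root buffer,
   and PetscSFBcast then moves root values into the local leaf buffer.

     PetscSF      sf;
     PetscSFNode *remote;
     PetscCall(PetscMalloc1(nleaves, &remote));
     for (PetscInt l = 0; l < nleaves; l++) {
       remote[l].rank  = srcrank[l]; // rank holding the root
       remote[l].index = srcslot[l]; // slot in that rank's rootdata
     }
     PetscCall(PetscSFCreate(comm, &sf));
     PetscCall(PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
     PetscCall(PetscSFBcastBegin(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFDestroy(&sf));

   In MatIncreaseOverlap_MPIAIJ_Once_Scalable() the root data are the locally packed send
   buffers (todata, then sbdata) and the leaf data are the matching receive buffers.
*/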

231: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232: {
233:   PetscInt       *isz, isz_i, i, j, is_id, data_size;
234:   PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
235:   const PetscInt *indices_i_temp;
236:   MPI_Comm       *iscomms;

238:   PetscFunctionBegin;
239:   max_lsize = 0;
240:   PetscCall(PetscMalloc1(nidx, &isz));
241:   for (i = 0; i < nidx; i++) {
242:     PetscCall(ISGetLocalSize(is[i], &lsize));
243:     max_lsize = lsize > max_lsize ? lsize : max_lsize;
244:     isz[i]    = lsize;
245:   }
246:   PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247:   for (i = 0; i < nidx; i++) {
248:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249:     PetscCall(ISGetIndices(is[i], &indices_i_temp));
250:     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
251:     PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252:     PetscCall(ISDestroy(&is[i]));
253:   }
254:   /* retrieve information to get row id and its overlap */
255:   for (i = 0; i < nrecvs;) {
256:     is_id     = recvdata[i++];
257:     data_size = recvdata[i++];
258:     indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259:     isz_i     = isz[is_id];
260:     for (j = 0; j < data_size; j++) {
261:       col                = recvdata[i++];
262:       indices_i[isz_i++] = col;
263:     }
264:     isz[is_id] = isz_i;
265:   }
266:   /* remove duplicate entries */
267:   for (i = 0; i < nidx; i++) {
268:     indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
269:     isz_i     = isz[i];
270:     PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271:     PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272:     PetscCall(PetscCommDestroy(&iscomms[i]));
273:   }
274:   PetscCall(PetscFree(isz));
275:   PetscCall(PetscFree2(indices_temp, iscomms));
276:   PetscFunctionReturn(PETSC_SUCCESS);
277: }
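
/*
   Editor's note: PetscSortRemoveDupsInt(), used above to compress each enlarged index set,
   sorts the array in place and shrinks the count to the number of unique entries, e.g.

     PetscInt n = 5, v[5] = {7, 3, 7, 1, 3};
     PetscCall(PetscSortRemoveDupsInt(&n, v)); // afterwards n == 3 and v starts with {1, 3, 7}
*/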

279: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280: {
281:   PetscLayout     rmap, cmap;
282:   PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283:   PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284:   PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285:   const PetscInt *gcols, *ai, *aj, *bi, *bj;
286:   Mat             amat, bmat;
287:   PetscMPIInt     rank;
288:   PetscBool       done;
289:   MPI_Comm        comm;

291:   PetscFunctionBegin;
292:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
294:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
295:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
296:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
298:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
300:   /* total number of nonzero values is used to estimate the memory usage in the next step */
301:   tnz = ai[an] + bi[bn];
302:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305:   /* to find the longest message */
306:   max_fszs = 0;
307:   for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
308:   /* is there a better way to estimate the number of nonzeros in the mat? */
309:   PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310:   for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
311:   rows_pos  = 0;
312:   totalrows = 0;
313:   for (i = 0; i < nfrom; i++) {
314:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
315:     /* group data together */
316:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
317:       is_id                       = fromrows[rows_pos++]; /* no of is */
318:       rows_i                      = rows_data[is_id];
319:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320:     }
321:     /* estimate a space to avoid multiple allocations  */
322:     for (j = 0; j < nidx; j++) {
323:       indvc_ij = 0;
324:       rows_i   = rows_data[j];
325:       for (l = 0; l < rows_pos_i[j]; l++) {
326:         row   = rows_i[l] - rstart;
327:         start = ai[row];
328:         end   = ai[row + 1];
329:         for (k = start; k < end; k++) { /* Amat */
330:           col                     = aj[k] + cstart;
331:           indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332:         }
333:         start = bi[row];
334:         end   = bi[row + 1];
335:         for (k = start; k < end; k++) { /* Bmat */
336:           col                     = gcols[bj[k]];
337:           indices_tmp[indvc_ij++] = col;
338:         }
339:       }
340:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341:       indv_counts[i * nidx + j] = indvc_ij;
342:       totalrows += indvc_ij;
343:     }
344:   }
345:   /* message triple <no of is, number of rows, rows> */
346:   PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347:   totalrows = 0;
348:   rows_pos  = 0;
349:   /* repeat the grouping pass above, this time filling the send buffer */
350:   for (i = 0; i < nfrom; i++) {
351:     PetscCall(PetscArrayzero(rows_pos_i, nidx));
352:     for (j = 0; j < fromsizes[2 * i]; j += 2) {
353:       is_id                       = fromrows[rows_pos++];
354:       rows_i                      = rows_data[is_id];
355:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356:     }
357:     /* add data  */
358:     for (j = 0; j < nidx; j++) {
359:       if (!indv_counts[i * nidx + j]) continue;
360:       indvc_ij            = 0;
361:       sbdata[totalrows++] = j;
362:       sbdata[totalrows++] = indv_counts[i * nidx + j];
363:       sbsizes[2 * i] += 2;
364:       rows_i = rows_data[j];
365:       for (l = 0; l < rows_pos_i[j]; l++) {
366:         row   = rows_i[l] - rstart;
367:         start = ai[row];
368:         end   = ai[row + 1];
369:         for (k = start; k < end; k++) { /* Amat */
370:           col                     = aj[k] + cstart;
371:           indices_tmp[indvc_ij++] = col;
372:         }
373:         start = bi[row];
374:         end   = bi[row + 1];
375:         for (k = start; k < end; k++) { /* Bmat */
376:           col                     = gcols[bj[k]];
377:           indices_tmp[indvc_ij++] = col;
378:         }
379:       }
380:       PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381:       sbsizes[2 * i] += indvc_ij;
382:       PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383:       totalrows += indvc_ij;
384:     }
385:   }
386:   PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387:   offsets[0] = 0;
388:   for (i = 0; i < nfrom; i++) {
389:     offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
390:     sbsizes[2 * i + 1] = offsets[i];
391:   }
392:   PetscCall(PetscFree(offsets));
393:   if (sbrowsizes) *sbrowsizes = sbsizes;
394:   if (sbrows) *sbrows = sbdata;
395:   PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398:   PetscFunctionReturn(PETSC_SUCCESS);
399: }
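
/*
   Worked example (editor's sketch) of the reply layout built above, the triple
   <no of is, number of rows, rows>: if, for one requesting rank, the overlap of is[0]
   consists of the global indices {2, 5, 9} and the overlap of is[3] of {11, 12}, the packed
   segment of sbdata for that rank is

     { 0, 3, 2, 5, 9,   3, 2, 11, 12 }

   and sbsizes[2*i] = 9, i.e. two header entries plus the index list for each contributing IS.
*/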

401: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402: {
403:   const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404:   PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405:   PetscInt        lsize, lsize_tmp;
406:   PetscMPIInt     rank, owner;
407:   Mat             amat, bmat;
408:   PetscBool       done;
409:   PetscLayout     cmap, rmap;
410:   MPI_Comm        comm;

412:   PetscFunctionBegin;
413:   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
415:   PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416:   PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
418:   PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419:   PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
420:   /* is this a safe way to compute the number of nonzero values? */
421:   tnz = ai[an] + bi[bn];
422:   PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423:   PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424:   PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
425:   /* this is a better way to estimate memory than the old implementation,
426:    * which used the global size of the matrix
427:    */
428:   PetscCall(PetscMalloc1(tnz, &indices_temp));
429:   for (i = 0; i < nidx; i++) {
430:     MPI_Comm iscomm;

432:     PetscCall(ISGetLocalSize(is[i], &lsize));
433:     PetscCall(ISGetIndices(is[i], &indices));
434:     lsize_tmp = 0;
435:     for (j = 0; j < lsize; j++) {
436:       owner = -1;
437:       row   = indices[j];
438:       PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439:       if (owner != rank) continue;
440:       /* convert to local row number */
441:       row -= rstart;
442:       start = ai[row];
443:       end   = ai[row + 1];
444:       for (k = start; k < end; k++) { /* Amat */
445:         col                       = aj[k] + cstart;
446:         indices_temp[lsize_tmp++] = col;
447:       }
448:       start = bi[row];
449:       end   = bi[row + 1];
450:       for (k = start; k < end; k++) { /* Bmat */
451:         col                       = gcols[bj[k]];
452:         indices_temp[lsize_tmp++] = col;
453:       }
454:     }
455:     PetscCall(ISRestoreIndices(is[i], &indices));
456:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457:     PetscCall(ISDestroy(&is[i]));
458:     PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459:     PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460:     PetscCall(PetscCommDestroy(&iscomm));
461:   }
462:   PetscCall(PetscFree(indices_temp));
463:   PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464:   PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465:   PetscFunctionReturn(PETSC_SUCCESS);
466: }
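
/*
   Editor's sketch of the column recovery performed in the loops above: the diagonal block
   (Amat) stores column indices relative to cstart, while the off-diagonal block (Bmat)
   stores compressed indices that must be mapped through gcols from MatMPIAIJGetSeqAIJ().
   For instance, assuming cstart = 100 and gcols = {3, 250, 511}:

     aj[k] = 7  ->  global column aj[k] + cstart = 107
     bj[k] = 2  ->  global column gcols[bj[k]]   = 511

   Each such global index becomes a candidate entry of the enlarged index set.
*/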

468: /*
469:   Sample message format:
470:   If processor A wants processor B to process some elements corresponding
471:   to index sets is[1] and is[5], the message looks like:
472:   mesg [0] = 2   (no of index sets in the mesg)
473:   -----------
474:   mesg [1] = 1 => is[1]
475:   mesg [2] = sizeof(is[1]);
476:   -----------
477:   mesg [3] = 5  => is[5]
478:   mesg [4] = sizeof(is[5]);
479:   -----------
480:   mesg [5]
481:   mesg [n]   data(is[1])  (entries 5..n)
482:   -----------
483:   mesg [n+1]
484:   mesg [m]   data(is[5])  (entries n+1..m)
485:   -----------

487:   Notes:
488:   nrqs - no of requests sent (or to be sent out)
489:   nrqr - no of requests received (which have to be or which have been processed)
490: */
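
/*
   Concrete instance of the format above (editor's example): if this rank asks another rank to
   process rows {3, 7} for is[1] and row {12} for is[5], the outgoing message is

     mesg = { 2,   1, 2,   5, 1,   3, 7,   12 }

   i.e. 2 index sets, header <1,2> for is[1], header <5,1> for is[5], followed by the rows
   requested for is[1] and then those for is[5].
*/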
491: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492: {
493:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
494:   PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495:   const PetscInt **idx, *idx_i;
496:   PetscInt        *n, **data, len;
497: #if defined(PETSC_USE_CTABLE)
498:   PetscHMapI *table_data, table_data_i;
499:   PetscInt   *tdata, tcount, tcount_max;
500: #else
501:   PetscInt *data_i, *d_p;
502: #endif
503:   PetscMPIInt  size, rank, tag1, tag2, proc = 0;
504:   PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505:   PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506:   PetscBT     *table;
507:   MPI_Comm     comm;
508:   MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509:   MPI_Status  *recv_status;
510:   MPI_Comm    *iscomms;
511:   char        *t_p;

513:   PetscFunctionBegin;
514:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515:   size = c->size;
516:   rank = c->rank;
517:   M    = C->rmap->N;

519:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

522:   PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

524:   for (i = 0; i < imax; i++) {
525:     PetscCall(ISGetIndices(is[i], &idx[i]));
526:     PetscCall(ISGetLocalSize(is[i], &n[i]));
527:   }

529:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
530:      required. Based on this, buffers are allocated, and data copied into them  */
531:   PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532:   for (i = 0; i < imax; i++) {
533:     PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534:     idx_i = idx[i];
535:     len   = n[i];
536:     for (j = 0; j < len; j++) {
537:       row = idx_i[j];
538:       PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539:       PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540:       w4[proc]++;
541:     }
542:     for (j = 0; j < size; j++) {
543:       if (w4[j]) {
544:         w1[j] += w4[j];
545:         w3[j]++;
546:       }
547:     }
548:   }

550:   nrqs     = 0; /* no of outgoing messages */
551:   msz      = 0; /* total mesg length (for all procs) */
552:   w1[rank] = 0; /* no mesg sent to itself */
553:   w3[rank] = 0;
554:   for (i = 0; i < size; i++) {
555:     if (w1[i]) {
556:       w2[i] = 1;
557:       nrqs++;
558:     } /* there exists a message to proc i */
559:   }
560:   /* pa - list of processors to communicate with */
561:   PetscCall(PetscMalloc1(nrqs, &pa));
562:   for (i = 0, j = 0; i < size; i++) {
563:     if (w1[i]) {
564:       pa[j] = i;
565:       j++;
566:     }
567:   }

569:   /* Each message would have a header = 1 + 2*(no of IS) + data */
570:   for (i = 0; i < nrqs; i++) {
571:     j = pa[i];
572:     w1[j] += w2[j] + 2 * w3[j];
573:     msz += w1[j];
574:   }

576:   /* Determine the number of messages to expect, their lengths, and the from-ids */
577:   PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578:   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

580:   /* Now post the Irecvs corresponding to these messages */
581:   PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

583:   /* Allocate Memory for outgoing messages */
584:   PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585:   PetscCall(PetscArrayzero(outdat, size));
586:   PetscCall(PetscArrayzero(ptr, size));

588:   {
589:     PetscInt *iptr = tmp, ict = 0;
590:     for (i = 0; i < nrqs; i++) {
591:       j = pa[i];
592:       iptr += ict;
593:       outdat[j] = iptr;
594:       ict       = w1[j];
595:     }
596:   }

598:   /* Form the outgoing messages */
599:   /* plug in the headers */
600:   for (i = 0; i < nrqs; i++) {
601:     j            = pa[i];
602:     outdat[j][0] = 0;
603:     PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604:     ptr[j] = outdat[j] + 2 * w3[j] + 1;
605:   }

607:   /* Memory for doing local proc's work */
608:   {
609:     PetscInt M_BPB_imax = 0;
610: #if defined(PETSC_USE_CTABLE)
611:     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612:     PetscCall(PetscMalloc1(imax, &table_data));
613:     for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614:     PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615:     for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616: #else
617:     PetscInt Mimax = 0;
618:     PetscCall(PetscIntMultError(M, imax, &Mimax));
619:     PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620:     PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621:     for (i = 0; i < imax; i++) {
622:       table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623:       data[i]  = d_p + M * i;
624:     }
625: #endif
626:   }

628:   /* Parse the IS and update local tables and the outgoing buf with the data */
629:   {
630:     PetscInt n_i, isz_i, *outdat_j, ctr_j;
631:     PetscBT  table_i;

633:     for (i = 0; i < imax; i++) {
634:       PetscCall(PetscArrayzero(ctr, size));
635:       n_i     = n[i];
636:       table_i = table[i];
637:       idx_i   = idx[i];
638: #if defined(PETSC_USE_CTABLE)
639:       table_data_i = table_data[i];
640: #else
641:       data_i = data[i];
642: #endif
643:       isz_i = isz[i];
644:       for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645:         row = idx_i[j];
646:         PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647:         if (proc != rank) { /* copy to the outgoing buffer */
648:           ctr[proc]++;
649:           *ptr[proc] = row;
650:           ptr[proc]++;
651:         } else if (!PetscBTLookupSet(table_i, row)) {
652: #if defined(PETSC_USE_CTABLE)
653:           PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654: #else
655:           data_i[isz_i] = row; /* Update the local table */
656: #endif
657:           isz_i++;
658:         }
659:       }
660:       /* Update the headers for the current IS */
661:       for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662:         if ((ctr_j = ctr[j])) {
663:           outdat_j            = outdat[j];
664:           k                   = ++outdat_j[0];
665:           outdat_j[2 * k]     = ctr_j;
666:           outdat_j[2 * k - 1] = i;
667:         }
668:       }
669:       isz[i] = isz_i;
670:     }
671:   }

673:   /*  Now  post the sends */
674:   PetscCall(PetscMalloc1(nrqs, &s_waits1));
675:   for (i = 0; i < nrqs; ++i) {
676:     j = pa[i];
677:     PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678:   }

680:   /* No longer need the original indices */
681:   PetscCall(PetscMalloc1(imax, &iscomms));
682:   for (i = 0; i < imax; ++i) {
683:     PetscCall(ISRestoreIndices(is[i], idx + i));
684:     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685:   }
686:   PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

688:   for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

690:     /* Do Local work */
691: #if defined(PETSC_USE_CTABLE)
692:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693: #else
694:   PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695: #endif

697:   /* Receive messages */
698:   PetscCall(PetscMalloc1(nrqr, &recv_status));
699:   PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700:   PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

702:   /* Phase 1 sends are complete - deallocate buffers */
703:   PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704:   PetscCall(PetscFree4(w1, w2, w3, w4));

706:   PetscCall(PetscMalloc1(nrqr, &xdata));
707:   PetscCall(PetscMalloc1(nrqr, &isz1));
708:   PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709:   PetscCall(PetscFree(rbuf[0]));
710:   PetscCall(PetscFree(rbuf));

712:   /* Send the data back */
713:   /* Do a global reduction to determine the buffer space required for incoming messages */
714:   {
715:     PetscMPIInt *rw1;

717:     PetscCall(PetscCalloc1(size, &rw1));

719:     for (i = 0; i < nrqr; ++i) {
720:       proc = recv_status[i].MPI_SOURCE;

722:       PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723:       rw1[proc] = isz1[i];
724:     }
725:     PetscCall(PetscFree(onodes1));
726:     PetscCall(PetscFree(olengths1));

728:     /* Determine the number of messages to expect, their lengths, and the from-ids */
729:     PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730:     PetscCall(PetscFree(rw1));
731:   }
732:   /* Now post the Irecvs corresponding to these messages */
733:   PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

735:   /* Now  post the sends */
736:   PetscCall(PetscMalloc1(nrqr, &s_waits2));
737:   for (i = 0; i < nrqr; ++i) {
738:     j = recv_status[i].MPI_SOURCE;
739:     PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740:   }

742:   /* receive work done on other processors */
743:   {
744:     PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745:     PetscMPIInt idex;
746:     PetscBT     table_i;

748:     for (i = 0; i < nrqs; ++i) {
749:       PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750:       /* Process the message */
751:       rbuf2_i = rbuf2[idex];
752:       ct1     = 2 * rbuf2_i[0] + 1;
753:       jmax    = rbuf2[idex][0];
754:       for (j = 1; j <= jmax; j++) {
755:         max     = rbuf2_i[2 * j];
756:         is_no   = rbuf2_i[2 * j - 1];
757:         isz_i   = isz[is_no];
758:         table_i = table[is_no];
759: #if defined(PETSC_USE_CTABLE)
760:         table_data_i = table_data[is_no];
761: #else
762:         data_i = data[is_no];
763: #endif
764:         for (k = 0; k < max; k++, ct1++) {
765:           row = rbuf2_i[ct1];
766:           if (!PetscBTLookupSet(table_i, row)) {
767: #if defined(PETSC_USE_CTABLE)
768:             PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769: #else
770:             data_i[isz_i] = row;
771: #endif
772:             isz_i++;
773:           }
774:         }
775:         isz[is_no] = isz_i;
776:       }
777:     }

779:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780:   }

782: #if defined(PETSC_USE_CTABLE)
783:   tcount_max = 0;
784:   for (i = 0; i < imax; ++i) {
785:     table_data_i = table_data[i];
786:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787:     if (tcount_max < tcount) tcount_max = tcount;
788:   }
789:   PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790: #endif

792:   for (i = 0; i < imax; ++i) {
793: #if defined(PETSC_USE_CTABLE)
794:     PetscHashIter tpos;

796:     table_data_i = table_data[i];
797:     PetscHashIterBegin(table_data_i, tpos);
798:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799:       PetscHashIterGetKey(table_data_i, tpos, k);
800:       PetscHashIterGetVal(table_data_i, tpos, j);
801:       PetscHashIterNext(table_data_i, tpos);
802:       tdata[--j] = --k;
803:     }
804:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805: #else
806:     PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807: #endif
808:     PetscCall(PetscCommDestroy(&iscomms[i]));
809:   }

811:   PetscCall(PetscFree(iscomms));
812:   PetscCall(PetscFree(onodes2));
813:   PetscCall(PetscFree(olengths2));

815:   PetscCall(PetscFree(pa));
816:   PetscCall(PetscFree(rbuf2[0]));
817:   PetscCall(PetscFree(rbuf2));
818:   PetscCall(PetscFree(s_waits1));
819:   PetscCall(PetscFree(r_waits1));
820:   PetscCall(PetscFree(s_waits2));
821:   PetscCall(PetscFree(r_waits2));
822:   PetscCall(PetscFree(recv_status));
823:   if (xdata) {
824:     PetscCall(PetscFree(xdata[0]));
825:     PetscCall(PetscFree(xdata));
826:   }
827:   PetscCall(PetscFree(isz1));
828: #if defined(PETSC_USE_CTABLE)
829:   for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830:   PetscCall(PetscFree(table_data));
831:   PetscCall(PetscFree(tdata));
832:   PetscCall(PetscFree4(table, data, isz, t_p));
833: #else
834:   PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835: #endif
836:   PetscFunctionReturn(PETSC_SUCCESS);
837: }
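
/*
   Editor's sketch of the PetscBT idiom used throughout the routine above for duplicate
   detection (M, n, idx, and unique are placeholders): PetscBTLookupSet() returns the previous
   value of the bit and then sets it, so an index is appended only the first time it is seen.

     PetscBT  seen;
     PetscInt nunique = 0;
     PetscCall(PetscBTCreate(M, &seen)); // one bit per global row, initially zero
     for (PetscInt j = 0; j < n; j++) {
       if (!PetscBTLookupSet(seen, idx[j])) unique[nunique++] = idx[j];
     }
     PetscCall(PetscBTDestroy(&seen));
*/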

839: /*
840:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once() to do
841:        the work on the local processor.

843:      Inputs:
844:       C      - MAT_MPIAIJ;
845:       imax - total no of index sets processed at a time;
846:       table  - an array of char - size = m bits.

848:      Output:
849:       isz    - array containing the count of the solution elements corresponding
850:                to each index set;
851:       data or table_data  - pointer to the solutions
852: */
853: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854: {
855:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856:   Mat         A = c->A, B = c->B;
857:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858:   PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
859:   PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
860:   PetscBT     table_i;
861: #if defined(PETSC_USE_CTABLE)
862:   PetscHMapI    table_data_i;
863:   PetscHashIter tpos;
864:   PetscInt      tcount, *tdata;
865: #else
866:   PetscInt *data_i;
867: #endif

869:   PetscFunctionBegin;
870:   rstart = C->rmap->rstart;
871:   cstart = C->cmap->rstart;
872:   ai     = a->i;
873:   aj     = a->j;
874:   bi     = b->i;
875:   bj     = b->j;
876:   garray = c->garray;

878:   for (i = 0; i < imax; i++) {
879: #if defined(PETSC_USE_CTABLE)
880:     /* copy existing entries of table_data_i into tdata[] */
881:     table_data_i = table_data[i];
882:     PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883:     PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

885:     PetscCall(PetscMalloc1(tcount, &tdata));
886:     PetscHashIterBegin(table_data_i, tpos);
887:     while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888:       PetscHashIterGetKey(table_data_i, tpos, row);
889:       PetscHashIterGetVal(table_data_i, tpos, j);
890:       PetscHashIterNext(table_data_i, tpos);
891:       tdata[--j] = --row;
892:       PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893:     }
894: #else
895:     data_i = data[i];
896: #endif
897:     table_i = table[i];
898:     isz_i   = isz[i];
899:     max     = isz[i];

901:     for (j = 0; j < max; j++) {
902: #if defined(PETSC_USE_CTABLE)
903:       row = tdata[j] - rstart;
904: #else
905:       row = data_i[j] - rstart;
906: #endif
907:       start = ai[row];
908:       end   = ai[row + 1];
909:       for (k = start; k < end; k++) { /* Amat */
910:         val = aj[k] + cstart;
911:         if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914: #else
915:           data_i[isz_i] = val;
916: #endif
917:           isz_i++;
918:         }
919:       }
920:       start = bi[row];
921:       end   = bi[row + 1];
922:       for (k = start; k < end; k++) { /* Bmat */
923:         val = garray[bj[k]];
924:         if (!PetscBTLookupSet(table_i, val)) {
925: #if defined(PETSC_USE_CTABLE)
926:           PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927: #else
928:           data_i[isz_i] = val;
929: #endif
930:           isz_i++;
931:         }
932:       }
933:     }
934:     isz[i] = isz_i;

936: #if defined(PETSC_USE_CTABLE)
937:     PetscCall(PetscFree(tdata));
938: #endif
939:   }
940:   PetscFunctionReturn(PETSC_SUCCESS);
941: }
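
/*
   Editor's note on the two storage schemes in the routine above: with PETSC_USE_CTABLE the
   growing index sets live in PetscHMapI hash maps keyed by row+1 with value position+1
   (matching the +1/-1 shifts in the code), which keeps memory proportional to the result
   size; otherwise a dense array of length M per index set is used. A minimal sketch of the
   hash-map variant, with row and pos as placeholders:

     PetscHMapI ht;
     PetscInt   count;
     PetscCall(PetscHMapICreateWithSize(16, &ht));
     PetscCall(PetscHMapISet(ht, row + 1, pos + 1));
     PetscCall(PetscHMapIGetSize(ht, &count));
     PetscCall(PetscHMapIDestroy(&ht));
*/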

943: /*
944:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945:          and return the output

947:          Input:
948:            C    - the matrix
949:            nrqr - no of messages being processed.
950:            rbuf - an array of pointers to the received requests

952:          Output:
953:            xdata - array of messages to be sent back
954:            isz1  - size of each message

956:   For better efficiency perhaps we should malloc each xdata[i] separately;
957: then, if a realloc is required, we need only copy the data for that one row
958: rather than for all previous rows, as happens now where a single large chunk of
959: memory is used.

961: */
962: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963: {
964:   Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965:   Mat         A = c->A, B = c->B;
966:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967:   PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968:   PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969:   PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970:   PetscInt   *rbuf_i, kmax, rbuf_0;
971:   PetscBT     xtable;

973:   PetscFunctionBegin;
974:   m      = C->rmap->N;
975:   rstart = C->rmap->rstart;
976:   cstart = C->cmap->rstart;
977:   ai     = a->i;
978:   aj     = a->j;
979:   bi     = b->i;
980:   bj     = b->j;
981:   garray = c->garray;

983:   for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984:     rbuf_i = rbuf[i];
985:     rbuf_0 = rbuf_i[0];
986:     ct += rbuf_0;
987:     for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988:   }

990:   if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991:   else max1 = 1;
992:   mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993:   if (nrqr) {
994:     PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995:     ++no_malloc;
996:   }
997:   PetscCall(PetscBTCreate(m, &xtable));
998:   PetscCall(PetscArrayzero(isz1, nrqr));

1000:   ct3 = 0;
1001:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002:     rbuf_i = rbuf[i];
1003:     rbuf_0 = rbuf_i[0];
1004:     ct1    = 2 * rbuf_0 + 1;
1005:     ct2    = ct1;
1006:     ct3 += ct1;
1007:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008:       PetscCall(PetscBTMemzero(m, xtable));
1009:       oct2 = ct2;
1010:       kmax = rbuf_i[2 * j];
1011:       for (k = 0; k < kmax; k++, ct1++) {
1012:         row = rbuf_i[ct1];
1013:         if (!PetscBTLookupSet(xtable, row)) {
1014:           if (!(ct3 < mem_estimate)) {
1015:             new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016:             PetscCall(PetscMalloc1(new_estimate, &tmp));
1017:             PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018:             PetscCall(PetscFree(xdata[0]));
1019:             xdata[0]     = tmp;
1020:             mem_estimate = new_estimate;
1021:             ++no_malloc;
1022:             for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023:           }
1024:           xdata[i][ct2++] = row;
1025:           ct3++;
1026:         }
1027:       }
1028:       for (k = oct2, max2 = ct2; k < max2; k++) {
1029:         row   = xdata[i][k] - rstart;
1030:         start = ai[row];
1031:         end   = ai[row + 1];
1032:         for (l = start; l < end; l++) {
1033:           val = aj[l] + cstart;
1034:           if (!PetscBTLookupSet(xtable, val)) {
1035:             if (!(ct3 < mem_estimate)) {
1036:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1038:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039:               PetscCall(PetscFree(xdata[0]));
1040:               xdata[0]     = tmp;
1041:               mem_estimate = new_estimate;
1042:               ++no_malloc;
1043:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044:             }
1045:             xdata[i][ct2++] = val;
1046:             ct3++;
1047:           }
1048:         }
1049:         start = bi[row];
1050:         end   = bi[row + 1];
1051:         for (l = start; l < end; l++) {
1052:           val = garray[bj[l]];
1053:           if (!PetscBTLookupSet(xtable, val)) {
1054:             if (!(ct3 < mem_estimate)) {
1055:               new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056:               PetscCall(PetscMalloc1(new_estimate, &tmp));
1057:               PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058:               PetscCall(PetscFree(xdata[0]));
1059:               xdata[0]     = tmp;
1060:               mem_estimate = new_estimate;
1061:               ++no_malloc;
1062:               for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063:             }
1064:             xdata[i][ct2++] = val;
1065:             ct3++;
1066:           }
1067:         }
1068:       }
1069:       /* Update the header*/
1070:       xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071:       xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072:     }
1073:     xdata[i][0] = rbuf_0;
1074:     if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075:     isz1[i] = ct2; /* size of each message */
1076:   }
1077:   PetscCall(PetscBTDestroy(&xtable));
1078:   PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079:   PetscFunctionReturn(PETSC_SUCCESS);
1080: }

1082: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083: /*
1084:     Every processor gets the entire matrix
1085: */
1086: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087: {
1088:   Mat         B;
1089:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090:   Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091:   PetscMPIInt size, rank;
1092:   PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1093:   PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

1095:   PetscFunctionBegin;
1096:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098:   if (scall == MAT_INITIAL_MATRIX) {
1099:     /* Tell every processor the number of nonzeros per row */
1100:     PetscCall(PetscCalloc1(A->rmap->N, &lens));
1101:     for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

1103:     /* All MPI processes get the same matrix */
1104:     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

1106:     /*     Create the sequential matrix of the same type as the local block diagonal  */
1107:     PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1108:     PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1109:     PetscCall(MatSetBlockSizesFromMats(B, A, A));
1110:     PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1111:     PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1112:     PetscCall(PetscCalloc1(2, Bin));
1113:     **Bin = B;
1114:     b     = (Mat_SeqAIJ *)B->data;

1116:     /* zero column space */
1117:     nrecv = 0;
1118:     for (i = 0; i < size; i++) {
1119:       for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1120:     }
1121:     PetscCall(PetscArrayzero(b->j, nrecv));

1123:     /*   Copy my part of matrix column indices over    */
1124:     sendcount  = ad->nz + bd->nz;
1125:     jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1126:     a_jsendbuf = ad->j;
1127:     b_jsendbuf = bd->j;
1128:     n          = A->rmap->rend - A->rmap->rstart;
1129:     cnt        = 0;
1130:     for (i = 0; i < n; i++) {
1131:       /* put in lower diagonal portion */
1132:       m = bd->i[i + 1] - bd->i[i];
1133:       while (m > 0) {
1134:         /* is it above diagonal (in bd (compressed) numbering) */
1135:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1136:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1137:         m--;
1138:       }

1140:       /* put in diagonal portion */
1141:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

1143:       /* put in upper diagonal portion */
1144:       while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1145:     }
1146:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1148:     /* send column indices, b->j was zeroed */
1149:     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

1151:     /*  Assemble the matrix into usable form (numerical values not yet set) */
1152:     /* set the b->ilen (length of each row) values */
1153:     PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154:     /* set the b->i indices */
1155:     b->i[0] = 0;
1156:     for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157:     PetscCall(PetscFree(lens));
1158:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

1161:   } else {
1162:     B = **Bin;
1163:     b = (Mat_SeqAIJ *)B->data;
1164:   }

1166:   /* Copy my part of matrix numerical values into the values location */
1167:   if (flag == MAT_GET_VALUES) {
1168:     const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169:     MatScalar         *sendbuf;

1171:     /* initialize b->a */
1172:     PetscCall(PetscArrayzero(b->a, b->nz));

1174:     PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1175:     PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1176:     sendcount = ad->nz + bd->nz;
1177:     sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1178:     a_sendbuf = ada;
1179:     b_sendbuf = bda;
1180:     b_sendj   = bd->j;
1181:     n         = A->rmap->rend - A->rmap->rstart;
1182:     cnt       = 0;
1183:     for (i = 0; i < n; i++) {
1184:       /* put in lower diagonal portion */
1185:       m = bd->i[i + 1] - bd->i[i];
1186:       while (m > 0) {
1187:         /* is it above diagonal (in bd (compressed) numbering) */
1188:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1189:         sendbuf[cnt++] = *b_sendbuf++;
1190:         m--;
1191:         b_sendj++;
1192:       }

1194:       /* put in diagonal portion */
1195:       for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

1197:       /* put in upper diagonal portion */
1198:       while (m-- > 0) {
1199:         sendbuf[cnt++] = *b_sendbuf++;
1200:         b_sendj++;
1201:       }
1202:     }
1203:     PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

1205:     /* send values, b->a was zeroed */
1206:     PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));

1208:     PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1209:     PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1210:   } /* end of (flag == MAT_GET_VALUES) */

1212:   PetscCall(MatPropagateSymmetryOptions(A, B));
1213:   PetscFunctionReturn(PETSC_SUCCESS);
1214: }
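
/*
   Usage sketch (editor's note): the routine above gives every rank a sequential copy of the
   entire parallel matrix; a rough direct call, with cleanup elided, might look like

     Mat *Aseq;
     PetscCall(MatCreateSubMatrix_MPIAIJ_All(A, MAT_GET_VALUES, MAT_INITIAL_MATRIX, &Aseq));
     // Aseq[0] now holds the complete matrix, of the same type as the local diagonal block

   With MAT_DO_NOT_GET_VALUES only the nonzero structure is assembled, and with
   MAT_REUSE_MATRIX a previously returned Aseq is refilled rather than recreated.
*/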

1216: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1217: {
1218:   Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
1219:   Mat             submat, A = c->A, B = c->B;
1220:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1221:   PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1222:   PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1223:   const PetscInt *icol, *irow;
1224:   PetscInt        nrow, ncol, start;
1225:   PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1226:   PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1227:   PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1228:   PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1229:   PetscInt       *lens, rmax, ncols, *cols, Crow;
1230: #if defined(PETSC_USE_CTABLE)
1231:   PetscHMapI cmap, rmap;
1232:   PetscInt  *cmap_loc, *rmap_loc;
1233: #else
1234:   PetscInt *cmap, *rmap;
1235: #endif
1236:   PetscInt      ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1237:   PetscInt     *cworkB, lwrite, *subcols, *row2proc;
1238:   PetscScalar  *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1239:   MPI_Request  *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1240:   MPI_Request  *r_waits4, *s_waits3 = NULL, *s_waits4;
1241:   MPI_Status   *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1242:   MPI_Status   *r_status3 = NULL, *r_status4, *s_status4;
1243:   MPI_Comm      comm;
1244:   PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1245:   PetscMPIInt  *onodes1, *olengths1, idex, end;
1246:   Mat_SubSppt  *smatis1;
1247:   PetscBool     isrowsorted, iscolsorted;

1249:   PetscFunctionBegin;
1252:   PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1253:   PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1254:   PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1255:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1256:   size = c->size;
1257:   rank = c->rank;

1259:   PetscCall(ISSorted(iscol[0], &iscolsorted));
1260:   PetscCall(ISSorted(isrow[0], &isrowsorted));
1261:   PetscCall(ISGetIndices(isrow[0], &irow));
1262:   PetscCall(ISGetLocalSize(isrow[0], &nrow));
1263:   if (allcolumns) {
1264:     icol = NULL;
1265:     ncol = C->cmap->N;
1266:   } else {
1267:     PetscCall(ISGetIndices(iscol[0], &icol));
1268:     PetscCall(ISGetLocalSize(iscol[0], &ncol));
1269:   }

1271:   if (scall == MAT_INITIAL_MATRIX) {
1272:     PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

1274:     /* Get some new tags to keep the communication clean */
1275:     tag1 = ((PetscObject)C)->tag;
1276:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1277:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

1279:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1280:      required. Based on this, buffers are allocated, and data copied into them */
1281:     PetscCall(PetscCalloc2(size, &w1, size, &w2));
1282:     PetscCall(PetscMalloc1(nrow, &row2proc));

1284:     /* w1[proc] = num of rows owned by proc -- to be requested */
1285:     proc = 0;
1286:     nrqs = 0; /* num of outgoing messages */
1287:     for (j = 0; j < nrow; j++) {
1288:       row = irow[j];
1289:       if (!isrowsorted) proc = 0;
1290:       while (row >= C->rmap->range[proc + 1]) proc++;
1291:       w1[proc]++;
1292:       row2proc[j] = proc; /* map row index to proc */

1294:       if (proc != rank && !w2[proc]) {
1295:         w2[proc] = 1;
1296:         nrqs++;
1297:       }
1298:     }
1299:     w1[rank] = 0; /* rows owned by self will not be requested */

1301:     PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1302:     for (proc = 0, j = 0; proc < size; proc++) {
1303:       if (w1[proc]) pa[j++] = proc;
1304:     }

1306:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1307:     msz = 0; /* total mesg length (for all procs) */
1308:     for (i = 0; i < nrqs; i++) {
1309:       proc = pa[i];
1310:       w1[proc] += 3;
1311:       msz += w1[proc];
1312:     }
1313:     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

1315:     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1316:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1317:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));

1319:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1320:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1321:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

1323:     /* Now post the Irecvs corresponding to these messages */
1324:     PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

1326:     PetscCall(PetscFree(onodes1));
1327:     PetscCall(PetscFree(olengths1));

1329:     /* Allocate Memory for outgoing messages */
1330:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1331:     PetscCall(PetscArrayzero(sbuf1, size));
1332:     PetscCall(PetscArrayzero(ptr, size));

1334:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1335:     iptr = tmp;
1336:     for (i = 0; i < nrqs; i++) {
1337:       proc        = pa[i];
1338:       sbuf1[proc] = iptr;
1339:       iptr += w1[proc];
1340:     }

1342:     /* Form the outgoing messages */
1343:     /* Initialize the header space */
1344:     for (i = 0; i < nrqs; i++) {
1345:       proc = pa[i];
1346:       PetscCall(PetscArrayzero(sbuf1[proc], 3));
1347:       ptr[proc] = sbuf1[proc] + 3;
1348:     }

1350:     /* Parse the isrow and copy data into outbuf */
1351:     PetscCall(PetscArrayzero(ctr, size));
1352:     for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1353:       proc = row2proc[j];
1354:       if (proc != rank) { /* copy to the outgoing buf */
1355:         *ptr[proc] = irow[j];
1356:         ctr[proc]++;
1357:         ptr[proc]++;
1358:       }
1359:     }

1361:     /* Update the headers for the current IS */
1362:     for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1363:       if ((ctr_j = ctr[j])) {
1364:         sbuf1_j            = sbuf1[j];
1365:         k                  = ++sbuf1_j[0];
1366:         sbuf1_j[2 * k]     = ctr_j;
1367:         sbuf1_j[2 * k - 1] = 0;
1368:       }
1369:     }
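    /* Layout of an outgoing request message sbuf1[proc] (single-IS case):
         sbuf1[proc][0]   = number of ISs with rows on proc (here at most 1)
         sbuf1[proc][1]   = IS number (always 0 here)
         sbuf1[proc][2]   = number of row indices that follow
         sbuf1[proc][3..] = the requested global row indices
       ptr[proc] was set just past the 3-entry header, so the rows were appended directly above. */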

1371:     /* Now post the sends */
1372:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
1373:     for (i = 0; i < nrqs; ++i) {
1374:       proc = pa[i];
1375:       PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1376:     }

1378:     /* Post Receives to capture the buffer size */
1379:     PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1380:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));

1382:     if (nrqs) rbuf2[0] = tmp + msz;
1383:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

1385:     for (i = 0; i < nrqs; ++i) {
1386:       proc = pa[i];
1387:       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1388:     }
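    /* Note: tmp was allocated with 2*msz entries; the first msz entries back the sbuf1[] request
       buffers carved out above, and the second half (starting at tmp + msz) is carved into the
       rbuf2[] reply buffers, so no separate allocation is needed for the reply lengths. */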

1390:     PetscCall(PetscFree2(w1, w2));

1392:     /* Send to other procs the buf size they should allocate */
1393:     /* Receive messages */
1394:     PetscCall(PetscMalloc1(nrqr, &r_status1));
1395:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));

1397:     PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1398:     for (i = 0; i < nrqr; ++i) {
1399:       req_size[i] = 0;
1400:       rbuf1_i     = rbuf1[i];
1401:       start       = 2 * rbuf1_i[0] + 1;
1402:       PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1403:       PetscCall(PetscMalloc1(end, &sbuf2[i]));
1404:       sbuf2_i = sbuf2[i];
1405:       for (j = start; j < end; j++) {
1406:         k          = rbuf1_i[j] - rstart;
1407:         ncols      = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1408:         sbuf2_i[j] = ncols;
1409:         req_size[i] += ncols;
1410:       }
1411:       req_source1[i] = r_status1[i].MPI_SOURCE;

1413:       /* form the header */
1414:       sbuf2_i[0] = req_size[i];
1415:       for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

1417:       PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1418:     }
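    /* Layout of a reply message sbuf2[i] (sent back to the requester with tag2):
         sbuf2_i[0]          = req_size[i], the total number of nonzeros in all requested rows
         sbuf2_i[1..start-1] = copy of the request header (IS number and row count)
         sbuf2_i[start..]    = number of nonzeros in each requested row of C (A part + B part)
       The requester uses sbuf2_i[0] to size the buffers that receive column indices and values. */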

1420:     PetscCall(PetscFree(r_status1));
1421:     PetscCall(PetscFree(r_waits1));

1423:     /* rbuf2 is received; post recvs for column indices a->j */
1424:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));

1426:     PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1427:     for (i = 0; i < nrqs; ++i) {
1428:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1429:       req_source2[i] = r_status2[i].MPI_SOURCE;
1430:       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1431:     }

1433:     /* Wait on sends1 and sends2 */
1434:     PetscCall(PetscMalloc1(nrqs, &s_status1));
1435:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1436:     PetscCall(PetscFree(s_waits1));
1437:     PetscCall(PetscFree(s_status1));

1439:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1440:     PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));

1442:     /* Now allocate sending buffers for a->j, and send them off */
1443:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1444:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1445:     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1446:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

1448:     for (i = 0; i < nrqr; i++) { /* for each requested message */
1449:       rbuf1_i   = rbuf1[i];
1450:       sbuf_aj_i = sbuf_aj[i];
1451:       ct1       = 2 * rbuf1_i[0] + 1;
1452:       ct2       = 0;
1453:       /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

1455:       kmax = rbuf1[i][2];
1456:       for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1457:         row    = rbuf1_i[ct1] - rstart;
1458:         nzA    = ai[row + 1] - ai[row];
1459:         nzB    = bi[row + 1] - bi[row];
1460:         ncols  = nzA + nzB;
1461:         cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1462:         cworkB = PetscSafePointerPlusOffset(bj, bi[row]);

1464:         /* load the column indices for this row into cols */
1465:         cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);

1467:         lwrite = 0;
1468:         for (l = 0; l < nzB; l++) {
1469:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1470:         }
1471:         for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1472:         for (l = 0; l < nzB; l++) {
1473:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1474:         }

1476:         ct2 += ncols;
1477:       }
1478:       PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1479:     }
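    /* For each requested row the column indices are written in three sweeps: the B (off-diagonal)
       columns that map below cstart, then the A (diagonal) columns shifted to global numbering by
       cstart, then the B columns that map at or beyond cend. Since bmap = c->garray holds the
       off-diagonal global columns in ascending order, each row's global columns come out sorted. */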

1481:     /* create column map (cmap): global col of C -> local col of submat */
1482: #if defined(PETSC_USE_CTABLE)
1483:     if (!allcolumns) {
1484:       PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1485:       PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1486:       for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1487:         if (icol[j] >= cstart && icol[j] < cend) {
1488:           cmap_loc[icol[j] - cstart] = j + 1;
1489:         } else { /* use PetscHMapI for non-local col indices */
1490:           PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1491:         }
1492:       }
1493:     } else {
1494:       cmap     = NULL;
1495:       cmap_loc = NULL;
1496:     }
1497:     PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1498: #else
1499:     if (!allcolumns) {
1500:       PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1501:       for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1502:     } else {
1503:       cmap = NULL;
1504:     }
1505: #endif
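    /* cmap records, for each selected global column of C, its (1-based) position in iscol; a
       lookup of 0 therefore means "column not selected". With PETSC_USE_CTABLE the locally owned
       columns [cstart,cend) use the dense array cmap_loc indexed by local column, and only the
       off-process columns go into the hash map, keeping the table small. */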

1507:     /* Create lens for MatSeqAIJSetPreallocation() */
1508:     PetscCall(PetscCalloc1(nrow, &lens));

1510:     /* Compute lens from local part of C */
1511:     for (j = 0; j < nrow; j++) {
1512:       row  = irow[j];
1513:       proc = row2proc[j];
1514:       if (proc == rank) {
1515:         /* diagonal part A = c->A */
1516:         ncols = ai[row - rstart + 1] - ai[row - rstart];
1517:         cols  = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1518:         if (!allcolumns) {
1519:           for (k = 0; k < ncols; k++) {
1520: #if defined(PETSC_USE_CTABLE)
1521:             tcol = cmap_loc[cols[k]];
1522: #else
1523:             tcol = cmap[cols[k] + cstart];
1524: #endif
1525:             if (tcol) lens[j]++;
1526:           }
1527:         } else { /* allcolumns */
1528:           lens[j] = ncols;
1529:         }

1531:         /* off-diagonal part B = c->B */
1532:         ncols = bi[row - rstart + 1] - bi[row - rstart];
1533:         cols  = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1534:         if (!allcolumns) {
1535:           for (k = 0; k < ncols; k++) {
1536: #if defined(PETSC_USE_CTABLE)
1537:             PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1538: #else
1539:             tcol = cmap[bmap[cols[k]]];
1540: #endif
1541:             if (tcol) lens[j]++;
1542:           }
1543:         } else { /* allcolumns */
1544:           lens[j] += ncols;
1545:         }
1546:       }
1547:     }

1549:     /* Create row map (rmap): global row of C -> local row of submat */
1550: #if defined(PETSC_USE_CTABLE)
1551:     PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1552:     for (j = 0; j < nrow; j++) {
1553:       row  = irow[j];
1554:       proc = row2proc[j];
1555:       if (proc == rank) { /* a local row */
1556:         rmap_loc[row - rstart] = j;
1557:       } else {
1558:         PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1559:       }
1560:     }
1561: #else
1562:     PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1563:     for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1564: #endif
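    /* rmap maps a global row of C to its row in submat. The CTABLE variant stores j+1 in the hash
       map (0 = not present) and uses the dense rmap_loc array for locally owned rows, while the
       plain-array variant simply stores j for every selected row. */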

1566:     /* Update lens from offproc data */
1567:     /* recv a->j is done */
1568:     PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1569:     for (i = 0; i < nrqs; i++) {
1570:       proc    = pa[i];
1571:       sbuf1_i = sbuf1[proc];
1572:       /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1573:       ct1     = 2 + 1;
1574:       ct2     = 0;
1575:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1576:       rbuf3_i = rbuf3[i]; /* received C->j */

1578:       /* is_no  = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1579:       max1 = sbuf1_i[2];
1580:       for (k = 0; k < max1; k++, ct1++) {
1581: #if defined(PETSC_USE_CTABLE)
1582:         PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1583:         row--;
1584:         PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1585: #else
1586:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1587: #endif
1588:         /* Now, store row index of submat in sbuf1_i[ct1] */
1589:         sbuf1_i[ct1] = row;

1591:         nnz = rbuf2_i[ct1];
1592:         if (!allcolumns) {
1593:           for (l = 0; l < nnz; l++, ct2++) {
1594: #if defined(PETSC_USE_CTABLE)
1595:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1596:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1597:             } else {
1598:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1599:             }
1600: #else
1601:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1602: #endif
1603:             if (tcol) lens[row]++;
1604:           }
1605:         } else { /* allcolumns */
1606:           lens[row] += nnz;
1607:         }
1608:       }
1609:     }
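    /* At this point lens[j] holds the number of nonzeros that row j of submat will receive,
       combining the locally owned rows counted above with the counts accumulated from the received
       column indices, so the preallocation below matches the entries inserted later. */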
1610:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1611:     PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));

1613:     /* Create the submatrices */
1614:     PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1615:     PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));

1617:     PetscCall(ISGetBlockSize(isrow[0], &i));
1618:     PetscCall(ISGetBlockSize(iscol[0], &j));
1619:     if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j));
1620:     PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1621:     PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));

1623:     /* create struct Mat_SubSppt and attach it to submat */
1624:     PetscCall(PetscNew(&smatis1));
1625:     subc            = (Mat_SeqAIJ *)submat->data;
1626:     subc->submatis1 = smatis1;

1628:     smatis1->id          = 0;
1629:     smatis1->nrqs        = nrqs;
1630:     smatis1->nrqr        = nrqr;
1631:     smatis1->rbuf1       = rbuf1;
1632:     smatis1->rbuf2       = rbuf2;
1633:     smatis1->rbuf3       = rbuf3;
1634:     smatis1->sbuf2       = sbuf2;
1635:     smatis1->req_source2 = req_source2;

1637:     smatis1->sbuf1 = sbuf1;
1638:     smatis1->ptr   = ptr;
1639:     smatis1->tmp   = tmp;
1640:     smatis1->ctr   = ctr;

1642:     smatis1->pa          = pa;
1643:     smatis1->req_size    = req_size;
1644:     smatis1->req_source1 = req_source1;

1646:     smatis1->allcolumns = allcolumns;
1647:     smatis1->singleis   = PETSC_TRUE;
1648:     smatis1->row2proc   = row2proc;
1649:     smatis1->rmap       = rmap;
1650:     smatis1->cmap       = cmap;
1651: #if defined(PETSC_USE_CTABLE)
1652:     smatis1->rmap_loc = rmap_loc;
1653:     smatis1->cmap_loc = cmap_loc;
1654: #endif

1656:     smatis1->destroy     = submat->ops->destroy;
1657:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1658:     submat->factortype   = C->factortype;

1660:     /* compute rmax */
1661:     rmax = 0;
1662:     for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);

1664:   } else { /* scall == MAT_REUSE_MATRIX */
1665:     submat = submats[0];
1666:     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");

1668:     subc        = (Mat_SeqAIJ *)submat->data;
1669:     rmax        = subc->rmax;
1670:     smatis1     = subc->submatis1;
1671:     nrqs        = smatis1->nrqs;
1672:     nrqr        = smatis1->nrqr;
1673:     rbuf1       = smatis1->rbuf1;
1674:     rbuf2       = smatis1->rbuf2;
1675:     rbuf3       = smatis1->rbuf3;
1676:     req_source2 = smatis1->req_source2;

1678:     sbuf1 = smatis1->sbuf1;
1679:     sbuf2 = smatis1->sbuf2;
1680:     ptr   = smatis1->ptr;
1681:     tmp   = smatis1->tmp;
1682:     ctr   = smatis1->ctr;

1684:     pa          = smatis1->pa;
1685:     req_size    = smatis1->req_size;
1686:     req_source1 = smatis1->req_source1;

1688:     allcolumns = smatis1->allcolumns;
1689:     row2proc   = smatis1->row2proc;
1690:     rmap       = smatis1->rmap;
1691:     cmap       = smatis1->cmap;
1692: #if defined(PETSC_USE_CTABLE)
1693:     rmap_loc = smatis1->rmap_loc;
1694:     cmap_loc = smatis1->cmap_loc;
1695: #endif
1696:   }

1698:   /* Post recv matrix values */
1699:   PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1700:   PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1701:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1702:   for (i = 0; i < nrqs; ++i) {
1703:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1704:     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1705:   }

1707:   /* Allocate sending buffers for a->a, and send them off */
1708:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1709:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1710:   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1711:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

1713:   for (i = 0; i < nrqr; i++) {
1714:     rbuf1_i   = rbuf1[i];
1715:     sbuf_aa_i = sbuf_aa[i];
1716:     ct1       = 2 * rbuf1_i[0] + 1;
1717:     ct2       = 0;
1718:     /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1720:     kmax = rbuf1_i[2];
1721:     for (k = 0; k < kmax; k++, ct1++) {
1722:       row    = rbuf1_i[ct1] - rstart;
1723:       nzA    = ai[row + 1] - ai[row];
1724:       nzB    = bi[row + 1] - bi[row];
1725:       ncols  = nzA + nzB;
1726:       cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1727:       vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1728:       vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);

1730:       /* load the column values for this row into vals */
1731:       vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);

1733:       lwrite = 0;
1734:       for (l = 0; l < nzB; l++) {
1735:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1736:       }
1737:       for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1738:       for (l = 0; l < nzB; l++) {
1739:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1740:       }

1742:       ct2 += ncols;
1743:     }
1744:     PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1745:   }
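  /* The numerical values are packed in exactly the same three-sweep order (lower B part, A part,
     upper B part) used for the column indices in sbuf_aj, so on the receiving side rbuf3[i][k] and
     rbuf4[i][k] always refer to the same matrix entry. */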

1747:   /* Assemble submat */
1748:   /* First assemble the local rows */
1749:   for (j = 0; j < nrow; j++) {
1750:     row  = irow[j];
1751:     proc = row2proc[j];
1752:     if (proc == rank) {
1753:       Crow = row - rstart; /* local row index of C */
1754: #if defined(PETSC_USE_CTABLE)
1755:       row = rmap_loc[Crow]; /* row index of submat */
1756: #else
1757:       row = rmap[row];
1758: #endif

1760:       if (allcolumns) {
1761:         /* diagonal part A = c->A */
1762:         ncols = ai[Crow + 1] - ai[Crow];
1763:         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1764:         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1765:         i     = 0;
1766:         for (k = 0; k < ncols; k++) {
1767:           subcols[i]   = cols[k] + cstart;
1768:           subvals[i++] = vals[k];
1769:         }

1771:         /* off-diagonal part B = c->B */
1772:         ncols = bi[Crow + 1] - bi[Crow];
1773:         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1774:         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1775:         for (k = 0; k < ncols; k++) {
1776:           subcols[i]   = bmap[cols[k]];
1777:           subvals[i++] = vals[k];
1778:         }

1780:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));

1782:       } else { /* !allcolumns */
1783: #if defined(PETSC_USE_CTABLE)
1784:         /* diagonal part A = c->A */
1785:         ncols = ai[Crow + 1] - ai[Crow];
1786:         cols  = PetscSafePointerPlusOffset(aj, ai[Crow]);
1787:         vals  = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1788:         i     = 0;
1789:         for (k = 0; k < ncols; k++) {
1790:           tcol = cmap_loc[cols[k]];
1791:           if (tcol) {
1792:             subcols[i]   = --tcol;
1793:             subvals[i++] = vals[k];
1794:           }
1795:         }

1797:         /* off-diagonal part B = c->B */
1798:         ncols = bi[Crow + 1] - bi[Crow];
1799:         cols  = PetscSafePointerPlusOffset(bj, bi[Crow]);
1800:         vals  = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1801:         for (k = 0; k < ncols; k++) {
1802:           PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1803:           if (tcol) {
1804:             subcols[i]   = --tcol;
1805:             subvals[i++] = vals[k];
1806:           }
1807:         }
1808: #else
1809:         /* diagonal part A = c->A */
1810:         ncols = ai[Crow + 1] - ai[Crow];
1811:         cols  = aj + ai[Crow];
1812:         vals  = a_a + ai[Crow];
1813:         i     = 0;
1814:         for (k = 0; k < ncols; k++) {
1815:           tcol = cmap[cols[k] + cstart];
1816:           if (tcol) {
1817:             subcols[i]   = --tcol;
1818:             subvals[i++] = vals[k];
1819:           }
1820:         }

1822:         /* off-diagonal part B = c->B */
1823:         ncols = bi[Crow + 1] - bi[Crow];
1824:         cols  = bj + bi[Crow];
1825:         vals  = b_a + bi[Crow];
1826:         for (k = 0; k < ncols; k++) {
1827:           tcol = cmap[bmap[cols[k]]];
1828:           if (tcol) {
1829:             subcols[i]   = --tcol;
1830:             subvals[i++] = vals[k];
1831:           }
1832:         }
1833: #endif
1834:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1835:       }
1836:     }
1837:   }

1839:   /* Now assemble the off-proc rows */
1840:   for (i = 0; i < nrqs; i++) { /* for each requested message */
1841:     /* recv values from other processes */
1842:     PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1843:     proc    = pa[idex];
1844:     sbuf1_i = sbuf1[proc];
1845:     /* jmax    = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1846:     ct1     = 2 + 1;
1847:     ct2     = 0;           /* count of received C->j */
1848:     ct3     = 0;           /* count of received C->j that will be inserted into submat */
1849:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1850:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1851:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1853:     /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1854:     max1 = sbuf1_i[2];                  /* num of rows */
1855:     for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
1856:       row = sbuf1_i[ct1];               /* row index of submat */
1857:       if (!allcolumns) {
1858:         idex = 0;
1859:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860:           nnz = rbuf2_i[ct1];                /* num of C entries in this row */
1861:           for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
1862: #if defined(PETSC_USE_CTABLE)
1863:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865:             } else {
1866:               PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1867:             }
1868: #else
1869:             tcol = cmap[rbuf3_i[ct2]];
1870: #endif
1871:             if (tcol) {
1872:               subcols[idex]   = --tcol; /* may not be sorted */
1873:               subvals[idex++] = rbuf4_i[ct2];

1875:               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1876:                For reuse, we replace the received C->j with the indices that should be inserted into submat */
1877:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878:             }
1879:           }
1880:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1881:         } else { /* scall == MAT_REUSE_MATRIX */
1882:           submat = submats[0];
1883:           subc   = (Mat_SeqAIJ *)submat->data;

1885:           nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886:           for (l = 0; l < nnz; l++) {
1887:             ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888:             subvals[idex++] = rbuf4_i[ct2];
1889:           }

1891:           bj = subc->j + subc->i[row]; /* sorted column indices */
1892:           PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1893:         }
1894:       } else {              /* allcolumns */
1895:         nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896:         PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1897:         ct2 += nnz;
1898:       }
1899:     }
1900:   }
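  /* Reuse optimization: during MAT_INITIAL_MATRIX with a sorted iscol, the positions (ct2) of the
     received entries that pass the column filter are written back into rbuf3_i. On a later
     MAT_REUSE_MATRIX call the values are gathered directly from rbuf4_i using those stored
     positions and inserted against the already sorted column indices subc->j, skipping the column
     lookups entirely. */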

1902:   /* sends of a->a are done */
1903:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1904:   PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));

1906:   PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1907:   PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1908:   submats[0] = submat;

1910:   /* Restore the indices */
1911:   PetscCall(ISRestoreIndices(isrow[0], &irow));
1912:   if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));

1914:   /* Destroy allocated memory */
1915:   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1916:   PetscCall(PetscFree3(rbuf4, subcols, subvals));
1917:   if (sbuf_aa) {
1918:     PetscCall(PetscFree(sbuf_aa[0]));
1919:     PetscCall(PetscFree(sbuf_aa));
1920:   }

1922:   if (scall == MAT_INITIAL_MATRIX) {
1923:     PetscCall(PetscFree(lens));
1924:     if (sbuf_aj) {
1925:       PetscCall(PetscFree(sbuf_aj[0]));
1926:       PetscCall(PetscFree(sbuf_aj));
1927:     }
1928:   }
1929:   PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1930:   PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1931:   PetscFunctionReturn(PETSC_SUCCESS);
1932: }

1934: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935: {
1936:   PetscInt  ncol;
1937:   PetscBool colflag, allcolumns = PETSC_FALSE;

1939:   PetscFunctionBegin;
1940:   /* Allocate memory to hold all the submatrices */
1941:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

1943:   /* Check for special case: each processor gets entire matrix columns */
1944:   PetscCall(ISIdentity(iscol[0], &colflag));
1945:   PetscCall(ISGetLocalSize(iscol[0], &ncol));
1946:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1948:   PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1949:   PetscFunctionReturn(PETSC_SUCCESS);
1950: }

1952: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1953: {
1954:   PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1955:   PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
1956:   Mat_SeqAIJ  *subc;
1957:   Mat_SubSppt *smat;

1959:   PetscFunctionBegin;
1960:   /* Check for special case: each processor has a single IS */
1961:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1962:     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1963:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1964:     PetscFunctionReturn(PETSC_SUCCESS);
1965:   }

1967:   /* Determine nmax, the maximum number of submatrices to be extracted per stage */
1968:   if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1969:   else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1970:   if (!nmax) nmax = 1;
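  /* The nmax heuristic budgets roughly 20*1000000 bytes worth of PetscInt per stage. Illustrative
     example (assuming 4-byte PetscInt and C->cmap->N = 1,000,000): nmax = 20000000/(1000000*4) = 5,
     i.e. at most 5 submatrices are extracted per stage. */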

1972:   if (scall == MAT_INITIAL_MATRIX) {
1973:     /* Collect global wantallmatrix and nstages */
1974:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1975:       PetscCall(ISIdentity(*isrow, &rowflag));
1976:       PetscCall(ISIdentity(*iscol, &colflag));
1977:       PetscCall(ISGetLocalSize(*isrow, &nrow));
1978:       PetscCall(ISGetLocalSize(*iscol, &ncol));
1979:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1980:         wantallmatrix = PETSC_TRUE;

1982:         PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1983:       }
1984:     }

1986:     /* Determine the number of stages through which submatrices are done
1987:        Each stage will extract nmax submatrices.
1988:        nmax is determined by the matrix column dimension.
1989:        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1990:     */
1991:     nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
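    /* The reduction below combines two pieces of global information in one MPIU_Allreduce with
       MPI_MAX: wantallmatrix is encoded as -1 (true) or 0 (false), so the maximum of the negated
       values is -1 only if every rank wants the whole matrix (a logical AND), while the maximum of
       the local nstages guarantees that all ranks execute the same number of stages. */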

1993:     in[0] = -1 * (PetscInt)wantallmatrix;
1994:     in[1] = nstages;
1995:     PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1996:     wantallmatrix = (PetscBool)(-out[0]);
1997:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

1999:   } else { /* MAT_REUSE_MATRIX */
2000:     if (ismax) {
2001:       subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2002:       smat = subc->submatis1;
2003:     } else { /* (*submat)[0] is a dummy matrix */
2004:       smat = (Mat_SubSppt *)(*submat)[0]->data;
2005:     }
2006:     if (!smat) {
2007:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2008:       wantallmatrix = PETSC_TRUE;
2009:     } else if (smat->singleis) {
2010:       PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2011:       PetscFunctionReturn(PETSC_SUCCESS);
2012:     } else {
2013:       nstages = smat->nstages;
2014:     }
2015:   }

2017:   if (wantallmatrix) {
2018:     PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2019:     PetscFunctionReturn(PETSC_SUCCESS);
2020:   }

2022:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2023:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

2025:   for (i = 0, pos = 0; i < nstages; i++) {
2026:     if (pos + nmax <= ismax) max_no = nmax;
2027:     else if (pos >= ismax) max_no = 0;
2028:     else max_no = ismax - pos;

2030:     PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2031:     if (!max_no) {
2032:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2033:         smat          = (Mat_SubSppt *)(*submat)[pos]->data;
2034:         smat->nstages = nstages;
2035:       }
2036:       pos++; /* advance to next dummy matrix if any */
2037:     } else pos += max_no;
2038:   }

2040:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2041:     /* save nstages for reuse */
2042:     subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
2043:     smat          = subc->submatis1;
2044:     smat->nstages = nstages;
2045:   }
2046:   PetscFunctionReturn(PETSC_SUCCESS);
2047: }

2049: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2050: {
2051:   Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
2052:   Mat              A = c->A;
2053:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2054:   const PetscInt **icol, **irow;
2055:   PetscInt        *nrow, *ncol, start;
2056:   PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2057:   PetscInt       **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2058:   PetscInt         nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2059:   PetscInt       **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2060:   PetscInt       **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2061: #if defined(PETSC_USE_CTABLE)
2062:   PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2063: #else
2064:   PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2065: #endif
2066:   const PetscInt *irow_i;
2067:   PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2068:   MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2069:   MPI_Request    *r_waits4, *s_waits3, *s_waits4;
2070:   MPI_Comm        comm;
2071:   PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2072:   PetscMPIInt    *onodes1, *olengths1, end;
2073:   PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2074:   Mat_SubSppt    *smat_i;
2075:   PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2076:   PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;

2078:   PetscFunctionBegin;
2079:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2080:   size = c->size;
2081:   rank = c->rank;

2083:   PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2084:   PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));

2086:   for (i = 0; i < ismax; i++) {
2087:     PetscCall(ISSorted(iscol[i], &issorted[i]));
2088:     if (!issorted[i]) iscsorted = issorted[i];

2090:     PetscCall(ISSorted(isrow[i], &issorted[i]));

2092:     PetscCall(ISGetIndices(isrow[i], &irow[i]));
2093:     PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));

2095:     /* Check for special case: allcolumns */
2096:     PetscCall(ISIdentity(iscol[i], &colflag));
2097:     PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2098:     if (colflag && ncol[i] == C->cmap->N) {
2099:       allcolumns[i] = PETSC_TRUE;
2100:       icol[i]       = NULL;
2101:     } else {
2102:       allcolumns[i] = PETSC_FALSE;
2103:       PetscCall(ISGetIndices(iscol[i], &icol[i]));
2104:     }
2105:   }

2107:   if (scall == MAT_REUSE_MATRIX) {
2108:     /* Assumes the new rows are the same length as the old rows */
2109:     for (i = 0; i < ismax; i++) {
2110:       PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2111:       subc = (Mat_SeqAIJ *)submats[i]->data;
2112:       PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");

2114:       /* Initial matrix as if empty */
2115:       PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));

2117:       smat_i = subc->submatis1;

2119:       nrqs        = smat_i->nrqs;
2120:       nrqr        = smat_i->nrqr;
2121:       rbuf1       = smat_i->rbuf1;
2122:       rbuf2       = smat_i->rbuf2;
2123:       rbuf3       = smat_i->rbuf3;
2124:       req_source2 = smat_i->req_source2;

2126:       sbuf1 = smat_i->sbuf1;
2127:       sbuf2 = smat_i->sbuf2;
2128:       ptr   = smat_i->ptr;
2129:       tmp   = smat_i->tmp;
2130:       ctr   = smat_i->ctr;

2132:       pa          = smat_i->pa;
2133:       req_size    = smat_i->req_size;
2134:       req_source1 = smat_i->req_source1;

2136:       allcolumns[i] = smat_i->allcolumns;
2137:       row2proc[i]   = smat_i->row2proc;
2138:       rmap[i]       = smat_i->rmap;
2139:       cmap[i]       = smat_i->cmap;
2140:     }

2142:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2143:       PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2144:       smat_i = (Mat_SubSppt *)submats[0]->data;

2146:       nrqs        = smat_i->nrqs;
2147:       nrqr        = smat_i->nrqr;
2148:       rbuf1       = smat_i->rbuf1;
2149:       rbuf2       = smat_i->rbuf2;
2150:       rbuf3       = smat_i->rbuf3;
2151:       req_source2 = smat_i->req_source2;

2153:       sbuf1 = smat_i->sbuf1;
2154:       sbuf2 = smat_i->sbuf2;
2155:       ptr   = smat_i->ptr;
2156:       tmp   = smat_i->tmp;
2157:       ctr   = smat_i->ctr;

2159:       pa          = smat_i->pa;
2160:       req_size    = smat_i->req_size;
2161:       req_source1 = smat_i->req_source1;

2163:       allcolumns[0] = PETSC_FALSE;
2164:     }
2165:   } else { /* scall == MAT_INITIAL_MATRIX */
2166:     /* Get some new tags to keep the communication clean */
2167:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2168:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

2170:     /* evaluate communication - message to whom, length of message, and buffer space
2171:      required. Based on this, buffers are allocated and data is copied into them */
2172:     PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */

2174:     for (i = 0; i < ismax; i++) {
2175:       jmax   = nrow[i];
2176:       irow_i = irow[i];

2178:       PetscCall(PetscMalloc1(jmax, &row2proc_i));
2179:       row2proc[i] = row2proc_i;

2181:       if (issorted[i]) proc = 0;
2182:       for (j = 0; j < jmax; j++) {
2183:         if (!issorted[i]) proc = 0;
2184:         row = irow_i[j];
2185:         while (row >= C->rmap->range[proc + 1]) proc++;
2186:         w4[proc]++;
2187:         row2proc_i[j] = proc; /* map row index to proc */
2188:       }
2189:       for (j = 0; j < size; j++) {
2190:         if (w4[j]) {
2191:           w1[j] += w4[j];
2192:           w3[j]++;
2193:           w4[j] = 0;
2194:         }
2195:       }
2196:     }

2198:     nrqs     = 0; /* no of outgoing messages */
2199:     msz      = 0; /* total mesg length (for all procs) */
2200:     w1[rank] = 0; /* no mesg sent to self */
2201:     w3[rank] = 0;
2202:     for (i = 0; i < size; i++) {
2203:       if (w1[i]) {
2204:         w2[i] = 1;
2205:         nrqs++;
2206:       } /* there exists a message to proc i */
2207:     }
2208:     PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc-array) */
2209:     for (i = 0, j = 0; i < size; i++) {
2210:       if (w1[i]) {
2211:         pa[j] = i;
2212:         j++;
2213:       }
2214:     }

2216:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2217:     for (i = 0; i < nrqs; i++) {
2218:       j = pa[i];
2219:       w1[j] += w2[j] + 2 * w3[j];
2220:       msz += w1[j];
2221:     }
2222:     PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

2224:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2225:     PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2226:     PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

2228:     /* Now post the Irecvs corresponding to these messages */
2229:     PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2230:     PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

2232:     /* Allocate Memory for outgoing messages */
2233:     PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2234:     PetscCall(PetscArrayzero(sbuf1, size));
2235:     PetscCall(PetscArrayzero(ptr, size));

2237:     {
2238:       PetscInt *iptr = tmp;
2239:       k              = 0;
2240:       for (i = 0; i < nrqs; i++) {
2241:         j = pa[i];
2242:         iptr += k;
2243:         sbuf1[j] = iptr;
2244:         k        = w1[j];
2245:       }
2246:     }

2248:     /* Form the outgoing messages. Initialize the header space */
2249:     for (i = 0; i < nrqs; i++) {
2250:       j           = pa[i];
2251:       sbuf1[j][0] = 0;
2252:       PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2253:       ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2254:     }
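    /* Layout of an outgoing request message sbuf1[j] in the general (multi-IS) case:
         sbuf1[j][0]             = number of ISs that request rows from proc j
         sbuf1[j][1..2*w3[j]]    = pairs (is_no, number of rows) for each such IS
         sbuf1[j][2*w3[j]+1 ..]  = the requested global row indices, grouped by IS
       ptr[j] starts just past the header, so the loop below appends the rows in IS order. */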

2256:     /* Parse the isrow and copy data into outbuf */
2257:     for (i = 0; i < ismax; i++) {
2258:       row2proc_i = row2proc[i];
2259:       PetscCall(PetscArrayzero(ctr, size));
2260:       irow_i = irow[i];
2261:       jmax   = nrow[i];
2262:       for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2263:         proc = row2proc_i[j];
2264:         if (proc != rank) { /* copy to the outgoing buf */
2265:           ctr[proc]++;
2266:           *ptr[proc] = irow_i[j];
2267:           ptr[proc]++;
2268:         }
2269:       }
2270:       /* Update the headers for the current IS */
2271:       for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2272:         if ((ctr_j = ctr[j])) {
2273:           sbuf1_j            = sbuf1[j];
2274:           k                  = ++sbuf1_j[0];
2275:           sbuf1_j[2 * k]     = ctr_j;
2276:           sbuf1_j[2 * k - 1] = i;
2277:         }
2278:       }
2279:     }

2281:     /* Now post the sends */
2282:     PetscCall(PetscMalloc1(nrqs, &s_waits1));
2283:     for (i = 0; i < nrqs; ++i) {
2284:       j = pa[i];
2285:       PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2286:     }

2288:     /* Post Receives to capture the buffer size */
2289:     PetscCall(PetscMalloc1(nrqs, &r_waits2));
2290:     PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2291:     if (nrqs) rbuf2[0] = tmp + msz;
2292:     for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2293:     for (i = 0; i < nrqs; ++i) {
2294:       j = pa[i];
2295:       PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2296:     }

2298:     /* Send to other procs the buf size they should allocate */
2299:     /* Receive messages */
2300:     PetscCall(PetscMalloc1(nrqr, &s_waits2));
2301:     PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2302:     {
2303:       PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2304:       PetscInt *sbuf2_i;

2306:       PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2307:       for (i = 0; i < nrqr; ++i) {
2308:         req_size[i] = 0;
2309:         rbuf1_i     = rbuf1[i];
2310:         start       = 2 * rbuf1_i[0] + 1;
2311:         end         = olengths1[i];
2312:         PetscCall(PetscMalloc1(end, &sbuf2[i]));
2313:         sbuf2_i = sbuf2[i];
2314:         for (j = start; j < end; j++) {
2315:           id         = rbuf1_i[j] - rstart;
2316:           ncols      = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2317:           sbuf2_i[j] = ncols;
2318:           req_size[i] += ncols;
2319:         }
2320:         req_source1[i] = onodes1[i];
2321:         /* form the header */
2322:         sbuf2_i[0] = req_size[i];
2323:         for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

2325:         PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2326:       }
2327:     }

2329:     PetscCall(PetscFree(onodes1));
2330:     PetscCall(PetscFree(olengths1));
2331:     PetscCall(PetscFree(r_waits1));
2332:     PetscCall(PetscFree4(w1, w2, w3, w4));

2334:     /* Receive messages*/
2335:     PetscCall(PetscMalloc1(nrqs, &r_waits3));
2336:     PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2337:     for (i = 0; i < nrqs; ++i) {
2338:       PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2339:       req_source2[i] = pa[i];
2340:       PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2341:     }
2342:     PetscCall(PetscFree(r_waits2));

2344:     /* Wait on sends1 and sends2 */
2345:     PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2346:     PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2347:     PetscCall(PetscFree(s_waits1));
2348:     PetscCall(PetscFree(s_waits2));

2350:     /* Now allocate sending buffers for a->j, and send them off */
2351:     PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2352:     for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2353:     if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2354:     for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

2356:     PetscCall(PetscMalloc1(nrqr, &s_waits3));
2357:     {
2358:       PetscInt  nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2359:       PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2360:       PetscInt  cend = C->cmap->rend;
2361:       PetscInt *a_j = a->j, *b_j = b->j, ctmp;

2363:       for (i = 0; i < nrqr; i++) {
2364:         rbuf1_i   = rbuf1[i];
2365:         sbuf_aj_i = sbuf_aj[i];
2366:         ct1       = 2 * rbuf1_i[0] + 1;
2367:         ct2       = 0;
2368:         for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2369:           kmax = rbuf1[i][2 * j];
2370:           for (k = 0; k < kmax; k++, ct1++) {
2371:             row    = rbuf1_i[ct1] - rstart;
2372:             nzA    = a_i[row + 1] - a_i[row];
2373:             nzB    = b_i[row + 1] - b_i[row];
2374:             ncols  = nzA + nzB;
2375:             cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2376:             cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);

2378:             /* load the column indices for this row into cols */
2379:             cols = sbuf_aj_i + ct2;

2381:             lwrite = 0;
2382:             for (l = 0; l < nzB; l++) {
2383:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2384:             }
2385:             for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2386:             for (l = 0; l < nzB; l++) {
2387:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2388:             }

2390:             ct2 += ncols;
2391:           }
2392:         }
2393:         PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2394:       }
2395:     }

2397:     /* create col map: global col of C -> local col of submatrices */
2398:     {
2399:       const PetscInt *icol_i;
2400: #if defined(PETSC_USE_CTABLE)
2401:       for (i = 0; i < ismax; i++) {
2402:         if (!allcolumns[i]) {
2403:           PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));

2405:           jmax   = ncol[i];
2406:           icol_i = icol[i];
2407:           cmap_i = cmap[i];
2408:           for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2409:         } else cmap[i] = NULL;
2410:       }
2411: #else
2412:       for (i = 0; i < ismax; i++) {
2413:         if (!allcolumns[i]) {
2414:           PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2415:           jmax   = ncol[i];
2416:           icol_i = icol[i];
2417:           cmap_i = cmap[i];
2418:           for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2419:         } else cmap[i] = NULL;
2420:       }
2421: #endif
2422:     }

2424:     /* Create lens which is required for MatCreate... */
2425:     for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2426:     PetscCall(PetscMalloc1(ismax, &lens));

2428:     if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2429:     for (i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
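    /* lens is allocated as one contiguous block: lens[0] holds sum_i nrow[i] zero-initialized
       entries and lens[i] points nrow[i-1] entries past lens[i-1], so a single PetscFree(lens[0])
       (done after preallocation) releases the counts for every submatrix. */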

2431:     /* Update lens from local data */
2432:     for (i = 0; i < ismax; i++) {
2433:       row2proc_i = row2proc[i];
2434:       jmax       = nrow[i];
2435:       if (!allcolumns[i]) cmap_i = cmap[i];
2436:       irow_i = irow[i];
2437:       lens_i = lens[i];
2438:       for (j = 0; j < jmax; j++) {
2439:         row  = irow_i[j];
2440:         proc = row2proc_i[j];
2441:         if (proc == rank) {
2442:           PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2443:           if (!allcolumns[i]) {
2444:             for (k = 0; k < ncols; k++) {
2445: #if defined(PETSC_USE_CTABLE)
2446:               PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2447: #else
2448:               tcol = cmap_i[cols[k]];
2449: #endif
2450:               if (tcol) lens_i[j]++;
2451:             }
2452:           } else { /* allcolumns */
2453:             lens_i[j] = ncols;
2454:           }
2455:           PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2456:         }
2457:       }
2458:     }

2460:     /* Create row map: global row of C -> local row of submatrices */
2461: #if defined(PETSC_USE_CTABLE)
2462:     for (i = 0; i < ismax; i++) {
2463:       PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2464:       irow_i = irow[i];
2465:       jmax   = nrow[i];
2466:       for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2467:     }
2468: #else
2469:     for (i = 0; i < ismax; i++) {
2470:       PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2471:       rmap_i = rmap[i];
2472:       irow_i = irow[i];
2473:       jmax   = nrow[i];
2474:       for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2475:     }
2476: #endif

2478:     /* Update lens from offproc data */
2479:     {
2480:       PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;

2482:       PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2483:       for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2484:         sbuf1_i = sbuf1[pa[tmp2]];
2485:         jmax    = sbuf1_i[0];
2486:         ct1     = 2 * jmax + 1;
2487:         ct2     = 0;
2488:         rbuf2_i = rbuf2[tmp2];
2489:         rbuf3_i = rbuf3[tmp2];
2490:         for (j = 1; j <= jmax; j++) {
2491:           is_no  = sbuf1_i[2 * j - 1];
2492:           max1   = sbuf1_i[2 * j];
2493:           lens_i = lens[is_no];
2494:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2495:           rmap_i = rmap[is_no];
2496:           for (k = 0; k < max1; k++, ct1++) {
2497: #if defined(PETSC_USE_CTABLE)
2498:             PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2499:             row--;
2500:             PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2501: #else
2502:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new matrix */
2503: #endif
2504:             max2 = rbuf2_i[ct1];
2505:             for (l = 0; l < max2; l++, ct2++) {
2506:               if (!allcolumns[is_no]) {
2507: #if defined(PETSC_USE_CTABLE)
2508:                 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2509: #else
2510:                 tcol = cmap_i[rbuf3_i[ct2]];
2511: #endif
2512:                 if (tcol) lens_i[row]++;
2513:               } else {         /* allcolumns */
2514:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2515:               }
2516:             }
2517:           }
2518:         }
2519:       }
2520:     }
2521:     PetscCall(PetscFree(r_waits3));
2522:     PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2523:     PetscCall(PetscFree(s_waits3));

2525:     /* Create the submatrices */
2526:     for (i = 0; i < ismax; i++) {
2527:       PetscInt rbs, cbs;

2529:       PetscCall(ISGetBlockSize(isrow[i], &rbs));
2530:       PetscCall(ISGetBlockSize(iscol[i], &cbs));

2532:       PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2533:       PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));

2535:       if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2536:       PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2537:       PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));

2539:       /* create struct Mat_SubSppt and attach it to submat */
2540:       PetscCall(PetscNew(&smat_i));
2541:       subc            = (Mat_SeqAIJ *)submats[i]->data;
2542:       subc->submatis1 = smat_i;

2544:       smat_i->destroy          = submats[i]->ops->destroy;
2545:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2546:       submats[i]->factortype   = C->factortype;

2548:       smat_i->id          = i;
2549:       smat_i->nrqs        = nrqs;
2550:       smat_i->nrqr        = nrqr;
2551:       smat_i->rbuf1       = rbuf1;
2552:       smat_i->rbuf2       = rbuf2;
2553:       smat_i->rbuf3       = rbuf3;
2554:       smat_i->sbuf2       = sbuf2;
2555:       smat_i->req_source2 = req_source2;

2557:       smat_i->sbuf1 = sbuf1;
2558:       smat_i->ptr   = ptr;
2559:       smat_i->tmp   = tmp;
2560:       smat_i->ctr   = ctr;

2562:       smat_i->pa          = pa;
2563:       smat_i->req_size    = req_size;
2564:       smat_i->req_source1 = req_source1;

2566:       smat_i->allcolumns = allcolumns[i];
2567:       smat_i->singleis   = PETSC_FALSE;
2568:       smat_i->row2proc   = row2proc[i];
2569:       smat_i->rmap       = rmap[i];
2570:       smat_i->cmap       = cmap[i];
2571:     }

2573:     if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
2574:       PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2575:       PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2576:       PetscCall(MatSetType(submats[0], MATDUMMY));

2578:       /* create struct Mat_SubSppt and attach it to submat */
2579:       PetscCall(PetscNew(&smat_i));
2580:       submats[0]->data = (void *)smat_i;

2582:       smat_i->destroy          = submats[0]->ops->destroy;
2583:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2584:       submats[0]->factortype   = C->factortype;

2586:       smat_i->id          = 0;
2587:       smat_i->nrqs        = nrqs;
2588:       smat_i->nrqr        = nrqr;
2589:       smat_i->rbuf1       = rbuf1;
2590:       smat_i->rbuf2       = rbuf2;
2591:       smat_i->rbuf3       = rbuf3;
2592:       smat_i->sbuf2       = sbuf2;
2593:       smat_i->req_source2 = req_source2;

2595:       smat_i->sbuf1 = sbuf1;
2596:       smat_i->ptr   = ptr;
2597:       smat_i->tmp   = tmp;
2598:       smat_i->ctr   = ctr;

2600:       smat_i->pa          = pa;
2601:       smat_i->req_size    = req_size;
2602:       smat_i->req_source1 = req_source1;

2604:       smat_i->allcolumns = PETSC_FALSE;
2605:       smat_i->singleis   = PETSC_FALSE;
2606:       smat_i->row2proc   = NULL;
2607:       smat_i->rmap       = NULL;
2608:       smat_i->cmap       = NULL;
2609:     }

2611:     if (ismax) PetscCall(PetscFree(lens[0]));
2612:     PetscCall(PetscFree(lens));
2613:     if (sbuf_aj) {
2614:       PetscCall(PetscFree(sbuf_aj[0]));
2615:       PetscCall(PetscFree(sbuf_aj));
2616:     }

2618:   } /* end of scall == MAT_INITIAL_MATRIX */

2620:   /* Post recv matrix values */
2621:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2622:   PetscCall(PetscMalloc1(nrqs, &rbuf4));
2623:   PetscCall(PetscMalloc1(nrqs, &r_waits4));
2624:   for (i = 0; i < nrqs; ++i) {
2625:     PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2626:     PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2627:   }

2629:   /* Allocate sending buffers for a->a, and send them off */
2630:   PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2631:   for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2632:   if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2633:   for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

2635:   PetscCall(PetscMalloc1(nrqr, &s_waits4));
2636:   {
2637:     PetscInt     nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2638:     PetscInt     cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2639:     PetscInt     cend = C->cmap->rend;
2640:     PetscInt    *b_j  = b->j;
2641:     PetscScalar *vworkA, *vworkB, *a_a, *b_a;

2643:     PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2644:     PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2645:     for (i = 0; i < nrqr; i++) {
2646:       rbuf1_i   = rbuf1[i];
2647:       sbuf_aa_i = sbuf_aa[i];
2648:       ct1       = 2 * rbuf1_i[0] + 1;
2649:       ct2       = 0;
2650:       for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2651:         kmax = rbuf1_i[2 * j];
2652:         for (k = 0; k < kmax; k++, ct1++) {
2653:           row    = rbuf1_i[ct1] - rstart;
2654:           nzA    = a_i[row + 1] - a_i[row];
2655:           nzB    = b_i[row + 1] - b_i[row];
2656:           ncols  = nzA + nzB;
2657:           cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2658:           vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2659:           vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);

2661:           /* load the column values for this row into vals */
2662:           vals = sbuf_aa_i + ct2;

2664:           lwrite = 0;
2665:           for (l = 0; l < nzB; l++) {
2666:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2667:           }
2668:           for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2669:           for (l = 0; l < nzB; l++) {
2670:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2671:           }

2673:           ct2 += ncols;
2674:         }
2675:       }
2676:       PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2677:     }
2678:     PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2679:     PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2680:   }

2682:   /* Assemble the matrices */
2683:   /* First assemble the local rows */
2684:   for (i = 0; i < ismax; i++) {
2685:     row2proc_i = row2proc[i];
2686:     subc       = (Mat_SeqAIJ *)submats[i]->data;
2687:     imat_ilen  = subc->ilen;
2688:     imat_j     = subc->j;
2689:     imat_i     = subc->i;
2690:     imat_a     = subc->a;

2692:     if (!allcolumns[i]) cmap_i = cmap[i];
2693:     rmap_i = rmap[i];
2694:     irow_i = irow[i];
2695:     jmax   = nrow[i];
2696:     for (j = 0; j < jmax; j++) {
2697:       row  = irow_i[j];
2698:       proc = row2proc_i[j];
2699:       if (proc == rank) {
2700:         old_row = row;
2701: #if defined(PETSC_USE_CTABLE)
2702:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2703:         row--;
2704: #else
2705:         row = rmap_i[row];
2706: #endif
2707:         ilen_row = imat_ilen[row];
2708:         PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2709:         mat_i = imat_i[row];
2710:         mat_a = imat_a + mat_i;
2711:         mat_j = imat_j + mat_i;
2712:         if (!allcolumns[i]) {
2713:           for (k = 0; k < ncols; k++) {
2714: #if defined(PETSC_USE_CTABLE)
2715:             PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2716: #else
2717:             tcol = cmap_i[cols[k]];
2718: #endif
2719:             if (tcol) {
2720:               *mat_j++ = tcol - 1;
2721:               *mat_a++ = vals[k];
2722:               ilen_row++;
2723:             }
2724:           }
2725:         } else { /* allcolumns */
2726:           for (k = 0; k < ncols; k++) {
2727:             *mat_j++ = cols[k]; /* global col index! */
2728:             *mat_a++ = vals[k];
2729:             ilen_row++;
2730:           }
2731:         }
2732:         PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));

2734:         imat_ilen[row] = ilen_row;
2735:       }
2736:     }
2737:   }

2739:   /* Now assemble the off proc rows */
2740:   PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
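  /*
     For each of the nrqs replies processed below: sbuf1[pa[tmp2]] still holds the original request
     (number of index sets, then for each set its number and row count, then the requested global rows);
     rbuf2 holds the per-row column counts, rbuf3 the global column indices, and rbuf4 the matrix values
     just received above.
  */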
2741:   for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2742:     sbuf1_i = sbuf1[pa[tmp2]];
2743:     jmax    = sbuf1_i[0];
2744:     ct1     = 2 * jmax + 1;
2745:     ct2     = 0;
2746:     rbuf2_i = rbuf2[tmp2];
2747:     rbuf3_i = rbuf3[tmp2];
2748:     rbuf4_i = rbuf4[tmp2];
2749:     for (j = 1; j <= jmax; j++) {
2750:       is_no  = sbuf1_i[2 * j - 1];
2751:       rmap_i = rmap[is_no];
2752:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2753:       subc      = (Mat_SeqAIJ *)submats[is_no]->data;
2754:       imat_ilen = subc->ilen;
2755:       imat_j    = subc->j;
2756:       imat_i    = subc->i;
2757:       imat_a    = subc->a;
2758:       max1      = sbuf1_i[2 * j];
2759:       for (k = 0; k < max1; k++, ct1++) {
2760:         row = sbuf1_i[ct1];
2761: #if defined(PETSC_USE_CTABLE)
2762:         PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2763:         row--;
2764: #else
2765:         row = rmap_i[row];
2766: #endif
2767:         ilen  = imat_ilen[row];
2768:         mat_i = imat_i[row];
2769:         mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2770:         mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2771:         max2  = rbuf2_i[ct1];
2772:         if (!allcolumns[is_no]) {
2773:           for (l = 0; l < max2; l++, ct2++) {
2774: #if defined(PETSC_USE_CTABLE)
2775:             PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2776: #else
2777:             tcol = cmap_i[rbuf3_i[ct2]];
2778: #endif
2779:             if (tcol) {
2780:               *mat_j++ = tcol - 1;
2781:               *mat_a++ = rbuf4_i[ct2];
2782:               ilen++;
2783:             }
2784:           }
2785:         } else { /* allcolumns */
2786:           for (l = 0; l < max2; l++, ct2++) {
2787:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2788:             *mat_a++ = rbuf4_i[ct2];
2789:             ilen++;
2790:           }
2791:         }
2792:         imat_ilen[row] = ilen;
2793:       }
2794:     }
2795:   }

2797:   if (!iscsorted) { /* sort column indices of the rows */
2798:     for (i = 0; i < ismax; i++) {
2799:       subc      = (Mat_SeqAIJ *)submats[i]->data;
2800:       imat_j    = subc->j;
2801:       imat_i    = subc->i;
2802:       imat_a    = subc->a;
2803:       imat_ilen = subc->ilen;

2805:       if (allcolumns[i]) continue;
2806:       jmax = nrow[i];
2807:       for (j = 0; j < jmax; j++) {
2808:         mat_i = imat_i[j];
2809:         mat_a = imat_a + mat_i;
2810:         mat_j = imat_j + mat_i;
2811:         PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2812:       }
2813:     }
2814:   }

2816:   PetscCall(PetscFree(r_waits4));
2817:   PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2818:   PetscCall(PetscFree(s_waits4));

2820:   /* Restore the indices */
2821:   for (i = 0; i < ismax; i++) {
2822:     PetscCall(ISRestoreIndices(isrow[i], irow + i));
2823:     if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2824:   }

2826:   for (i = 0; i < ismax; i++) {
2827:     PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2828:     PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2829:   }

2831:   /* Destroy allocated memory */
2832:   if (sbuf_aa) {
2833:     PetscCall(PetscFree(sbuf_aa[0]));
2834:     PetscCall(PetscFree(sbuf_aa));
2835:   }
2836:   PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));

2838:   for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2839:   PetscCall(PetscFree(rbuf4));

2841:   PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2842:   PetscFunctionReturn(PETSC_SUCCESS);
2843: }
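/*
   Illustrative sketch (not compiled here) of the user-level entry point that ultimately drives the
   extraction and assembly code above; the stride index sets and their sizes are hypothetical:

     IS   isrow, iscol;
     Mat *submats;
     PetscCall(ISCreateStride(PETSC_COMM_SELF, 5, 0, 1, &isrow));
     PetscCall(ISCreateStride(PETSC_COMM_SELF, 5, 0, 1, &iscol));
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats));
     ... use submats[0] ...
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&isrow));
     PetscCall(ISDestroy(&iscol));
*/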

2845: /*
2846:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2847:  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2848:  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
2849:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2850:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2851:  state, and needs to be "assembled" later by compressing B's column space.

2853:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2854:  Following this call, C's diagonal block (aij->A) and off-diagonal block (aij->B) have been created, even if empty.
2855:  */
2856: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2857: {
2858:   /* If making this function public, change the error returned in this function away from _PLIB. */
2859:   Mat_MPIAIJ     *aij;
2860:   Mat_SeqAIJ     *Baij;
2861:   PetscBool       seqaij, Bdisassembled;
2862:   PetscInt        m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2863:   PetscScalar     v;
2864:   const PetscInt *rowindices, *colindices;

2866:   PetscFunctionBegin;
2867:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2868:   if (A) {
2869:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2870:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2871:     if (rowemb) {
2872:       PetscCall(ISGetLocalSize(rowemb, &m));
2873:       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2874:     } else {
2875:       PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2876:     }
2877:     if (dcolemb) {
2878:       PetscCall(ISGetLocalSize(dcolemb, &n));
2879:       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2880:     } else {
2881:       PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2882:     }
2883:   }
2884:   if (B) {
2885:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2886:     PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2887:     if (rowemb) {
2888:       PetscCall(ISGetLocalSize(rowemb, &m));
2889:       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2890:     } else {
2891:       PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2892:     }
2893:     if (ocolemb) {
2894:       PetscCall(ISGetLocalSize(ocolemb, &n));
2895:       PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2896:     } else {
2897:       PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2898:     }
2899:   }

2901:   aij = (Mat_MPIAIJ *)C->data;
2902:   if (!aij->A) {
2903:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2904:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2905:     PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2906:     PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2907:     PetscCall(MatSetType(aij->A, MATSEQAIJ));
2908:   }
2909:   if (A) {
2910:     PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2911:   } else {
2912:     PetscCall(MatSetUp(aij->A));
2913:   }
2914:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2915:     /*
2916:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2917:       need to "disassemble" B -- convert it to using C's global indices.
2918:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2920:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2921:       we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2923:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2924:       At least avoid calling MatSetValues() and the implied searches?
2925:     */

2927:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2928: #if defined(PETSC_USE_CTABLE)
2929:       PetscCall(PetscHMapIDestroy(&aij->colmap));
2930: #else
2931:       PetscCall(PetscFree(aij->colmap));
2932:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2933: #endif
2934:       ngcol = 0;
2935:       if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2936:       if (aij->garray) PetscCall(PetscFree(aij->garray));
2937:       PetscCall(VecDestroy(&aij->lvec));
2938:       PetscCall(VecScatterDestroy(&aij->Mvctx));
2939:     }
2940:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2941:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2942:   }
2943:   Bdisassembled = PETSC_FALSE;
2944:   if (!aij->B) {
2945:     PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2946:     PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2947:     PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2948:     PetscCall(MatSetType(aij->B, MATSEQAIJ));
2949:     Bdisassembled = PETSC_TRUE;
2950:   }
2951:   if (B) {
2952:     Baij = (Mat_SeqAIJ *)B->data;
2953:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2954:       PetscCall(PetscMalloc1(B->rmap->n, &nz));
2955:       for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2956:       PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2957:       PetscCall(PetscFree(nz));
2958:     }

2960:     PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2961:     shift      = rend - rstart;
2962:     count      = 0;
2963:     rowindices = NULL;
2964:     colindices = NULL;
2965:     if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2966:     if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
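    /*
       Copy B's entries into aij->B with MatSetValues(). Each column index (after the optional ocolemb
       embedding) lies in [0, N-n); when aij->B was created above in the "disassembled" (global-column)
       state, indices below rstart already coincide with C's global columns left of the diagonal block,
       while indices at or above rstart are shifted past the diagonal block by shift = rend - rstart.
       For example, with rstart = 10 and rend = 15, local column 3 stays at global column 3 and local
       column 12 maps to global column 17.
    */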
2967:     for (i = 0; i < B->rmap->n; i++) {
2968:       PetscInt row;
2969:       row = i;
2970:       if (rowindices) row = rowindices[i];
2971:       for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2972:         col = Baij->j[count];
2973:         if (colindices) col = colindices[col];
2974:         if (Bdisassembled && col >= rstart) col += shift;
2975:         v = Baij->a[count];
2976:         PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2977:         ++count;
2978:       }
2979:     }
2980:     /* No assembly for aij->B is necessary. */
2981:     /* FIXME: set aij->B's nonzerostate correctly. */
2982:   } else {
2983:     PetscCall(MatSetUp(aij->B));
2984:   }
2985:   C->preallocated  = PETSC_TRUE;
2986:   C->was_assembled = PETSC_FALSE;
2987:   C->assembled     = PETSC_FALSE;
2988:   /*
2989:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2990:       Furthermore, C's nonzerostate will need to be based on those of aij->A and aij->B.
2991:    */
2992:   PetscFunctionReturn(PETSC_SUCCESS);
2993: }

2995: /*
2996:   B uses local column indices lying in [0, N-n); they must be interpreted using garray.
2997:  */
2998: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2999: {
3000:   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;

3002:   PetscFunctionBegin;
3003:   PetscAssertPointer(A, 2);
3004:   PetscAssertPointer(B, 3);
3005:   /* FIXME: make sure C is assembled */
3006:   *A = aij->A;
3007:   *B = aij->B;
3008:   /* Note that we don't incref *A and *B, so be careful! */
3009:   PetscFunctionReturn(PETSC_SUCCESS);
3010: }
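/*
   A minimal sketch (not compiled here) of interpreting B's local column indices through garray, using
   the public accessor MatMPIAIJGetSeqAIJ(); the row number 0 is just an example:

     Mat             Ad, Ao;
     const PetscInt *garray, *cols;
     PetscInt        ncols;
     PetscCall(MatMPIAIJGetSeqAIJ(C, &Ad, &Ao, &garray));
     PetscCall(MatGetRow(Ao, 0, &ncols, &cols, NULL));
     ... garray[cols[k]] is the global column of C for the k-th off-diag entry of row 0 ...
     PetscCall(MatRestoreRow(Ao, 0, &ncols, &cols, NULL));
*/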

3012: /*
3013:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3014:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3015: */
3016: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3017: {
3018:   PetscMPIInt size, flag;
3019:   PetscInt    i, ii, cismax, ispar;
3020:   Mat        *A, *B;
3021:   IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

3023:   PetscFunctionBegin;
3024:   if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

3026:   for (i = 0, cismax = 0; i < ismax; ++i) {
3027:     PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3028:     PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3029:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3030:     if (size > 1) ++cismax;
3031:   }

3033:   /*
3034:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3035:      ispar counts the number of parallel ISs across C's comm.
3036:   */
3037:   PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3038:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3039:     PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3040:     PetscFunctionReturn(PETSC_SUCCESS);
3041:   }

3043:   /* if (ispar) */
3044:   /*
3045:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3046:     These are used to extract the off-diag portion of the resulting parallel matrix.
3047:     The row IS for the off-diag portion is the same as for the diag portion,
3048:     so we merely alias (without incrementing its reference count) the row IS, while skipping those that are sequential.
3049:   */
3050:   PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3051:   PetscCall(PetscMalloc1(cismax, &ciscol_p));
3052:   for (i = 0, ii = 0; i < ismax; ++i) {
3053:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3054:     if (size > 1) {
3055:       /*
3056:          TODO: This is the part that's ***NOT SCALABLE***.
3057:          To fix this we need to extract just the indices of C's nonzero columns
3058:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3059:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3060:          to be done without serializing on the IS list, so, most likely, it is best
3061:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3062:       */
3063:       PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3064:       /* Now we have to
3065:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3066:              were sorted on each rank, they might no longer be sorted once concatenated;
3067:          (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
3068:              indices in nondecreasing order back to their original positions.
3069:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3070:       */
3071:       PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3072:       PetscCall(ISSort(ciscol[ii]));
3073:       ++ii;
3074:     }
3075:   }
3076:   PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3077:   for (i = 0, ii = 0; i < ismax; ++i) {
3078:     PetscInt        j, issize;
3079:     const PetscInt *indices;

3081:     /*
3082:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3083:      */
3084:     PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3085:     PetscCall(ISSort(isrow[i]));
3086:     PetscCall(ISGetLocalSize(isrow[i], &issize));
3087:     PetscCall(ISGetIndices(isrow[i], &indices));
3088:     for (j = 1; j < issize; ++j) {
3089:       PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3090:     }
3091:     PetscCall(ISRestoreIndices(isrow[i], &indices));
3092:     PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3093:     PetscCall(ISSort(iscol[i]));
3094:     PetscCall(ISGetLocalSize(iscol[i], &issize));
3095:     PetscCall(ISGetIndices(iscol[i], &indices));
3096:     for (j = 1; j < issize; ++j) {
3097:       PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3098:     }
3099:     PetscCall(ISRestoreIndices(iscol[i], &indices));
3100:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3101:     if (size > 1) {
3102:       cisrow[ii] = isrow[i];
3103:       ++ii;
3104:     }
3105:   }
3106:   /*
3107:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3108:     array of sequential matrices underlying the resulting parallel matrices.
3109:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3110:     contain duplicates.

3112:     There are as many diag matrices as there are original index sets. There are only as many parallel
3113:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3115:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3116:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3117:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3118:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3119:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3120:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3121:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3122:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3123:       (or retrieve the composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3124:       values into A[i] and B[ii] sitting inside the corresponding submat.
3125:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3126:       A[i], B[ii], AA[i] or BB[ii] matrices.
3127:   */
3128:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3129:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

3131:   /* Now obtain the sequential A and B submatrices separately. */
3132:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3133:   PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3134:   PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

3136:   /*
3137:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3138:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3139:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3140:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3141:     to have the same sparsity pattern.
3142:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3143:     must be constructed for C. This is done by setseqmat(s).
3144:   */
3145:   for (i = 0, ii = 0; i < ismax; ++i) {
3146:     /*
3147:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3148:        That way we can avoid sorting and computing permutations when reusing.
3149:        To this end:
3150:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3151:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3152:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3153:     */
3154:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3156:     PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3157:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3158:     if (size > 1) {
3159:       if (scall == MAT_INITIAL_MATRIX) {
3160:         PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3161:         PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3162:         PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3163:         PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3164:         PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3165:       }
3166:       /*
3167:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3168:       */
3169:       {
3170:         Mat AA = A[i], BB = B[ii];

3172:         if (AA || BB) {
3173:           PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3174:           PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3175:           PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3176:         }
3177:         PetscCall(MatDestroy(&AA));
3178:       }
3179:       PetscCall(ISDestroy(ciscol + ii));
3180:       PetscCall(ISDestroy(ciscol_p + ii));
3181:       ++ii;
3182:     } else { /* if (size == 1) */
3183:       if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3184:       if (isrow_p[i] || iscol_p[i]) {
3185:         PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3186:         PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3187:         /* Otherwise A is extracted straight into (*submat)[i]. */
3188:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3189:         PetscCall(MatDestroy(A + i));
3190:       } else (*submat)[i] = A[i];
3191:     }
3192:     PetscCall(ISDestroy(&isrow_p[i]));
3193:     PetscCall(ISDestroy(&iscol_p[i]));
3194:   }
3195:   PetscCall(PetscFree2(cisrow, ciscol));
3196:   PetscCall(PetscFree2(isrow_p, iscol_p));
3197:   PetscCall(PetscFree(ciscol_p));
3198:   PetscCall(PetscFree(A));
3199:   PetscCall(MatDestroySubMatrices(cismax, &B));
3200:   PetscFunctionReturn(PETSC_SUCCESS);
3201: }

3203: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3204: {
3205:   PetscFunctionBegin;
3206:   PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3207:   PetscFunctionReturn(PETSC_SUCCESS);
3208: }
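/*
   This wrapper supplies the MPIAIJ-specific callbacks to the generic driver above and implements
   MatCreateSubMatricesMPI() for MATMPIAIJ, which accepts index sets living on subcommunicators of C
   and returns parallel submatrices. A minimal sketch (not compiled here), with hypothetical parallel
   index sets isrow/iscol defined on a subcommunicator of C:

     Mat *submats;
     PetscCall(MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats));
     ... use submats[0], which lives on isrow's communicator ...
     PetscCall(MatDestroySubMatrices(1, &submats));
*/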