Actual source code: sbaijov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix.
  3:    Used for finding submatrices that are shared across processes.
  4: */
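      /*
         Illustrative usage sketch (not part of this file): these routines implement
         MatIncreaseOverlap() for MATMPISBAIJ matrices, so a typical caller does

           Mat C;       // assembled MATMPISBAIJ matrix
           IS  is[2];   // index sets to be enlarged, built elsewhere
           PetscCall(MatIncreaseOverlap(C, 2, is, 1)); // grow each set by one level of overlap

         which dispatches to MatIncreaseOverlap_MPISBAIJ() defined below.
      */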
  5: #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
  6: #include <petscbt.h>

  8: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *);
  9: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *);

 11: PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov)
 12: {
 13:   PetscInt        i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov;
 14:   IS             *is_new, *is_row;
 15:   Mat            *submats;
 16:   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ *)C->data;
 17:   Mat_SeqSBAIJ   *asub_i;
 18:   PetscBT         table;
 19:   PetscInt       *ai, brow, nz, nis, l, nmax, nstages_local, nstages, max_no, pos;
 20:   const PetscInt *idx;
 21:   PetscBool       flg;

 23:   PetscFunctionBegin;
 24:   PetscCall(PetscMalloc1(is_max, &is_new));
 25:   /* Convert the indices into block format */
 26:   PetscCall(ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new));
 27:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");

 29:   /* check whether the previous non-scalable implementation was requested */
 30:   flg = PETSC_FALSE;
 31:   PetscCall(PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg));
 32:   if (flg) { /* previous non-scalable implementation */
 33:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "use previous non-scalable implementation...\n"));
 34:     for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new));
 35:   } else { /* implementation using modified BAIJ routines */

 37:     PetscCall(PetscMalloc1(Mbs + 1, &nidx));
 38:     PetscCall(PetscBTCreate(Mbs, &table)); /* for column search */

 40:     /* Create is_row */
 41:     PetscCall(PetscMalloc1(is_max, &is_row));
 42:     PetscCall(ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0]));

 44:     for (i = 1; i < is_max; i++) { is_row[i] = is_row[0]; /* reuse is_row[0] */ }

 46:     /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */
 47:     PetscCall(PetscMalloc1(is_max + 1, &submats));

 49:     /* Determine the number of stages through which submatrices are done */
 50:     nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt));
 51:     if (!nmax) nmax = 1;
 52:     nstages_local = is_max / nmax + ((is_max % nmax) ? 1 : 0);

 54:     /* Make sure every processor loops through the same number of stages */
 55:     PetscCallMPI(MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
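          /* Illustrative example (values assumed, not from the source): with c->Nbs = 1000000 block
             columns and 8-byte PetscInt, nmax = 20*1000000/(1000000*8) = 2, so a processor holding
             is_max = 5 index sets uses nstages_local = 3; the MPI_MAX reduction above then makes
             every processor execute the same number of stages. */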

 57:     {
 58:       const PetscObject obj = (PetscObject)c->A;
 59:       size_t            new_len, cur_len, max_len;

 61:       PetscCall(PetscStrlen(MATSEQBAIJ, &new_len));
 62:       PetscCall(PetscStrlen(MATSEQSBAIJ, &cur_len));
 63:       max_len = PetscMax(cur_len, new_len) + 1;
 64:       PetscCall(PetscRealloc(max_len * sizeof(*obj->type_name), &obj->type_name));
 65:       /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to
 66:          trigger that */
 67:       for (iov = 0; iov < ov; ++iov) {
 68:         /* 1) Get submats for column search */
 69:         PetscCall(PetscStrncpy(obj->type_name, MATSEQBAIJ, max_len));
 70:         for (i = 0, pos = 0; i < nstages; i++) {
 71:           if (pos + nmax <= is_max) max_no = nmax;
 72:           else if (pos == is_max) max_no = 0;
 73:           else max_no = is_max - pos;
 74:           c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */

 76:           PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos, PETSC_TRUE));
 77:           pos += max_no;
 78:         }
 79:         PetscCall(PetscStrncpy(obj->type_name, MATSEQSBAIJ, max_len));

 81:         /* 2) Row search */
 82:         PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new));

 84:         /* 3) Column search */
 85:         for (i = 0; i < is_max; i++) {
 86:           asub_i = (Mat_SeqSBAIJ *)submats[i]->data;
 87:           ai     = asub_i->i;

 89:           /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ_Once() into the table */
 90:           PetscCall(PetscBTMemzero(Mbs, table));

 92:           PetscCall(ISGetIndices(is_new[i], &idx));
 93:           PetscCall(ISGetLocalSize(is_new[i], &nis));
 94:           for (l = 0; l < nis; l++) {
 95:             PetscCall(PetscBTSet(table, idx[l]));
 96:             nidx[l] = idx[l];
 97:           }
 98:           isz = nis;

100:           /* add column entries to table */
101:           for (brow = 0; brow < Mbs; brow++) {
102:             nz = ai[brow + 1] - ai[brow];
103:             if (nz) {
104:               if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow;
105:             }
106:           }
107:           PetscCall(ISRestoreIndices(is_new[i], &idx));
108:           PetscCall(ISDestroy(&is_new[i]));

110:           /* create updated is_new */
111:           PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i));
112:         }

114:         /* Free tmp spaces */
115:         for (i = 0; i < is_max; i++) PetscCall(MatDestroy(&submats[i]));
116:       }

118:       PetscCall(PetscBTDestroy(&table));
119:       PetscCall(PetscFree(submats));
120:       PetscCall(ISDestroy(&is_row[0]));
121:       PetscCall(PetscFree(is_row));
122:       PetscCall(PetscFree(nidx));
123:     }
124:   }
125:   for (i = 0; i < is_max; i++) {
126:     PetscCall(ISDestroy(&is[i]));
127:     PetscCall(ISGetLocalSize(is_new[i], &nis));
128:     PetscCall(ISGetIndices(is_new[i], &idx));
129:     PetscCall(ISCreateBlock(PetscObjectComm((PetscObject)is_new[i]), bs, nis, idx, PETSC_COPY_VALUES, &is[i]));
130:     PetscCall(ISDestroy(&is_new[i]));
131:   }
132:   PetscCall(PetscFree(is_new));
133:   PetscFunctionReturn(PETSC_SUCCESS);
134: }

136: typedef enum {
137:   MINE,
138:   OTHER
139: } WhoseOwner;
140: /*  data1, odata1 and odata2 are packed in the format (for communication):
141:        data[0]          = is_max, the number of index sets
142:        data[1]          = size of is[0]
143:         ...
144:        data[is_max]     = size of is[is_max-1]
145:        data[is_max + 1] = data(is[0])
146:         ...
147:        data[is_max + 1 + sum(size of is[k], k=0,...,i-1)] = data(is[i])
148:         ...
149:    data2 is packed in the format (for creating output is[]):
150:        data[0]          = is_max, the number of index sets
151:        data[1]          = size of is[0]
152:         ...
153:        data[is_max]     = size of is[is_max-1]
154:        data[is_max + 1] = data(is[0])
155:         ...
156:        data[is_max + 1 + Mbs*i] = data(is[i])
157:         ...
158: */
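      /*
         Illustrative example (values assumed, not from the source): with is_max = 2,
         is[0] = {3,7} and is[1] = {5}, data1 is packed as {2, 2, 1, 3, 7, 5}, i.e. the
         header {is_max, size(is[0]), size(is[1])} followed by the concatenated indices,
         whereas data2 reserves a fixed Mbs-long slot per index set, so data(is[1])
         starts at data2[is_max + 1 + Mbs].
      */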
159: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[])
160: {
161:   Mat_MPISBAIJ   *c        = (Mat_MPISBAIJ *)C->data;
162:   PetscMPIInt     proc_end = 0, size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, *iwork;
163:   const PetscInt *idx_i;
164:   PetscInt        idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i, len;
165:   PetscInt        Mbs, i, j, k, *odata1, *odata2;
166:   PetscInt      **odata2_ptr, *ctable = NULL, *btable, len_max, len_est;
167:   PetscInt        len_unused, nodata2;
168:   PetscInt        ois_max; /* max number of is[] over all processors */
169:   char           *t_p;
170:   MPI_Comm        comm;
171:   MPI_Request    *s_waits1, *s_waits2, r_req;
172:   MPI_Status     *s_status, r_status;
173:   PetscBT        *table; /* mark indices of this processor's is[] */
174:   PetscBT         table_i;
175:   PetscBT         otable; /* mark indices of other processors' is[] */
176:   PetscInt        bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners;
177:   IS              garray_local, garray_gl;

179:   PetscFunctionBegin;
180:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
181:   size = c->size;
182:   rank = c->rank;
183:   Mbs  = c->Mbs;

185:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
186:   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

188:   /* create tables used in
189:      step 1: table[i] - mark c->garray of proc [i]
190:      step 3: table[i] - mark indices of is[i] when whose=MINE
191:              table[0] - mark indices of is[] when whose=OTHER */
192:   len = PetscMax(is_max, size);
193:   PetscCall(PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p));
194:   for (i = 0; i < len; i++) table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i;

196:   PetscCallMPI(MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm));

198:   /* 1. Send this processor's is[] to other processors */
199:   /* allocate spaces */
200:   PetscCall(PetscMalloc1(is_max, &n));
201:   len = 0;
202:   for (i = 0; i < is_max; i++) {
203:     PetscCall(ISGetLocalSize(is[i], &n[i]));
204:     len += n[i];
205:   }
206:   if (!len) {
207:     is_max = 0;
208:   } else {
209:     len += 1 + is_max; /* max length of data1 for one processor */
210:   }

212:   PetscCall(PetscMalloc1(size * len + 1, &data1));
213:   PetscCall(PetscMalloc1(size, &data1_start));
214:   for (i = 0; i < size; i++) data1_start[i] = data1 + i * len;

216:   PetscCall(PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners));

218:   /* gather c->garray from all processors */
219:   PetscCall(ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local));
220:   PetscCall(ISAllGather(garray_local, &garray_gl));
221:   PetscCall(ISDestroy(&garray_local));
222:   PetscCallMPI(MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm));

224:   Bowners[0] = 0;
225:   for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i];
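        /* Illustrative example (values assumed, not from the source): with per-rank Bnbs = {4,2,3},
           Bowners becomes {0,4,6,9}, so entries [Bowners[i], Bowners[i+1]) of the gathered garray
           belong to processor i. */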

227:   if (is_max) {
228:     /* hash table ctable which maps each block row to its owning proc_id */
229:     PetscCall(PetscMalloc1(Mbs, &ctable));
230:     j = 0;
231:     for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
232:       for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id;
233:     }
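          /* Illustrative example (values assumed, not from the source): with two processors owning
             block rows [0,3) and [3,7), ctable = {0,0,0,1,1,1,1}, so ctable[idx] is the rank that
             owns block row idx. */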

235:     /* hash tables marking c->garray */
236:     PetscCall(ISGetIndices(garray_gl, &idx_i));
237:     for (i = 0; i < size; i++) {
238:       table_i = table[i];
239:       PetscCall(PetscBTMemzero(Mbs, table_i));
240:       for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/
241:         PetscCall(PetscBTSet(table_i, idx_i[j]));
242:       }
243:     }
244:     PetscCall(ISRestoreIndices(garray_gl, &idx_i));
245:   } /* if (is_max) */
246:   PetscCall(ISDestroy(&garray_gl));

248:   /* evaluate communication - message destinations, lengths, and buffer space */
249:   for (i = 0; i < size; i++) len_s[i] = 0;

251:   /* header of data1 */
252:   for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
253:     iwork[proc_id]        = 0;
254:     *data1_start[proc_id] = is_max;
255:     data1_start[proc_id]++;
256:     for (j = 0; j < is_max; j++) {
257:       if (proc_id == rank) {
258:         *data1_start[proc_id] = n[j];
259:       } else {
260:         *data1_start[proc_id] = 0;
261:       }
262:       data1_start[proc_id]++;
263:     }
264:   }

266:   for (i = 0; i < is_max; i++) {
267:     PetscCall(ISGetIndices(is[i], &idx_i));
268:     for (j = 0; j < n[i]; j++) {
269:       idx                = idx_i[j];
270:       *data1_start[rank] = idx;
271:       data1_start[rank]++; /* for local processing */
272:       PetscCall(PetscMPIIntCast(ctable[idx], &proc_end));
273:       for (PetscMPIInt proc_id = 0; proc_id <= proc_end; proc_id++) {            /* for others to process */
274:         if (proc_id == rank) continue;                                           /* done before this loop */
275:         if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */
276:         *data1_start[proc_id] = idx;
277:         data1_start[proc_id]++;
278:         len_s[proc_id]++;
279:       }
280:     }
281:     /* update header data */
282:     for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) {
283:       if (proc_id == rank) continue;
284:       *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id];
285:       iwork[proc_id]                   = len_s[proc_id];
286:     }
287:     PetscCall(ISRestoreIndices(is[i], &idx_i));
288:   }

290:   nrqs = 0;
291:   nrqr = 0;
292:   for (i = 0; i < size; i++) {
293:     data1_start[i] = data1 + i * len;
294:     if (len_s[i]) {
295:       nrqs++;
296:       len_s[i] += 1 + is_max; /* add no. of header msg */
297:     }
298:   }

300:   for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is[i]));
301:   PetscCall(PetscFree(n));
302:   PetscCall(PetscFree(ctable));

304:   /* Determine the number of messages to expect, their lengths, and their source ranks */
305:   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr));
306:   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1));

308:   /*  Now  post the sends */
309:   PetscCall(PetscMalloc2(size, &s_waits1, size, &s_waits2));
310:   k = 0;
311:   for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */
312:     if (len_s[proc_id]) {
313:       PetscCallMPI(MPIU_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k));
314:       k++;
315:     }
316:   }

318:   /* 2. Receive other processors' is[], process them, then send the results back */
319:   len = 0;
320:   for (i = 0; i < nrqr; i++) {
321:     if (len_r1[i] > len) len = len_r1[i];
322:   }
323:   PetscCall(PetscFree(len_r1));
324:   PetscCall(PetscFree(id_r1));

326:   for (PetscMPIInt proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;

328:   PetscCall(PetscMalloc1(len + 1, &odata1));
329:   PetscCall(PetscMalloc1(size, &odata2_ptr));
330:   PetscCall(PetscBTCreate(Mbs, &otable));

332:   len_max = ois_max * (Mbs + 1); /* max space needed to store all is[] from one received message */
333:   len_est = 2 * len_max;         /* estimated space for storing is[] from all received messages */
334:   PetscCall(PetscMalloc1(len_est + 1, &odata2));
335:   nodata2 = 0; /* nodata2+1 = number of PetscMalloc() calls made for odata2_ptr[] */

337:   odata2_ptr[nodata2] = odata2;

339:   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */

341:   k = 0;
342:   while (k < nrqr) {
343:     PetscMPIInt ilen;

345:     /* Receive messages */
346:     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status));
347:     if (flag) {
348:       PetscMPIInt proc_id;

350:       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &ilen));
351:       proc_id = r_status.MPI_SOURCE;
352:       PetscCallMPI(MPIU_Irecv(odata1, ilen, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
353:       PetscCallMPI(MPI_Wait(&r_req, &r_status));

355:       /*  Process messages */
356:       /*  make sure there is enough unused space in odata2 array */
357:       if (len_unused < len_max) { /* allocate more space for odata2 */
358:         PetscCall(PetscMalloc1(len_est + 1, &odata2));
359:         odata2_ptr[++nodata2] = odata2;
360:         len_unused            = len_est;
361:       }

363:       PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable));
364:       len = 1 + odata2[0];
365:       for (i = 0; i < odata2[0]; i++) len += odata2[1 + i];

367:       /* Send messages back */
368:       PetscCallMPI(MPIU_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k));
369:       k++;
370:       odata2 += len;
371:       len_unused -= len;
372:       PetscCall(PetscMPIIntCast(len, &len_s[proc_id])); /* length of message sending back to proc_id */
373:     }
374:   }
375:   PetscCall(PetscFree(odata1));
376:   PetscCall(PetscBTDestroy(&otable));

378:   /* 3. Do local work on this processor's is[] */
379:   /* make sure there is enough unused space in odata2(=data) array */
380:   len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */
381:   if (len_unused < len_max) {   /* allocate more space for odata2 */
382:     PetscCall(PetscMalloc1(len_est + 1, &odata2));

384:     odata2_ptr[++nodata2] = odata2;
385:   }

387:   data = odata2;
388:   PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table));
389:   PetscCall(PetscFree(data1_start));

391:   /* 4. Receive work done on other processors, then merge */
392:   /* get the maximum length of any message this processor expects to receive */
393:   PetscCallMPI(MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm));
394:   PetscCall(PetscMalloc1(iwork[rank] + 1, &data2));
395:   PetscCall(PetscFree4(len_s, btable, iwork, Bowners));

397:   k = 0;
398:   while (k < nrqs) {
399:     /* Receive messages */
400:     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status));
401:     if (flag) {
402:       PetscMPIInt proc_id, ilen;
403:       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &ilen));
404:       proc_id = r_status.MPI_SOURCE;
405:       PetscCallMPI(MPIU_Irecv(data2, ilen, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
406:       PetscCallMPI(MPI_Wait(&r_req, &r_status));
407:       if (ilen > 1 + is_max) { /* Add data2 into data */
408:         data2_i = data2 + 1 + is_max;
409:         for (i = 0; i < is_max; i++) {
410:           table_i = table[i];
411:           data_i  = data + 1 + is_max + Mbs * i;
412:           isz     = data[1 + i];
413:           for (j = 0; j < data2[1 + i]; j++) {
414:             col = data2_i[j];
415:             if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col;
416:           }
417:           data[1 + i] = isz;
418:           if (i < is_max - 1) data2_i += data2[1 + i];
419:         }
420:       }
421:       k++;
422:     }
423:   }
424:   PetscCall(PetscFree(data2));
425:   PetscCall(PetscFree2(table, t_p));

427:   /* wait for phase 1 sends to complete */
428:   PetscCall(PetscMalloc1(size, &s_status));
429:   if (nrqs) PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status));
430:   PetscCall(PetscFree(data1));

432:   /* wait for phase 2 sends to complete */
433:   if (nrqr) PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status));
434:   PetscCall(PetscFree2(s_waits1, s_waits2));
435:   PetscCall(PetscFree(s_status));

437:   /* 5. Create new is[] */
438:   for (i = 0; i < is_max; i++) {
439:     data_i = data + 1 + is_max + Mbs * i;
440:     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i));
441:   }
442:   for (k = 0; k <= nodata2; k++) PetscCall(PetscFree(odata2_ptr[k]));
443:   PetscCall(PetscFree(odata2_ptr));
444:   PetscFunctionReturn(PETSC_SUCCESS);
445: }

447: /*
448:    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
449:        the work on the local processor.

451:      Inputs:
452:       C      - a MATMPISBAIJ matrix;
453:       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
454:       whose  - whose is[] to be processed,
455:                MINE:  this processor's is[]
456:                OTHER: other processor's is[]
457:      Output:
458:        nidx  - whose = MINE:
459:                      holds input and newly found indices in the same format as data
460:                whose = OTHER:
461:                      only holds the newly found indices
462:        table - table[i]: mark the indices of is[i], i=0,...,is_max-1. Used only in the case 'whose=MINE'.
463: */
464: /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
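      /* Added layout note (derived from the code below): for whose = MINE the indices of output
         set i are written to nidx + 1 + is_max + Mbs*i, one fixed Mbs-long slot per set, while
         for whose = OTHER the output sets are packed back-to-back starting at nidx + is_max + 1. */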
465: static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table)
466: {
467:   Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
468:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)c->A->data;
469:   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ *)c->B->data;
470:   PetscInt      row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l;
471:   PetscInt      a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n;
472:   PetscBT       table0;  /* mark the indices of input is[] for look up */
473:   PetscBT       table_i; /* points to i-th table. When whose=OTHER, a single table is used for all is[] */

475:   PetscFunctionBegin;
476:   Mbs    = c->Mbs;
477:   mbs    = a->mbs;
478:   ai     = a->i;
479:   aj     = a->j;
480:   bi     = b->i;
481:   bj     = b->j;
482:   garray = c->garray;
483:   rstart = c->rstartbs;
484:   is_max = data[0];

486:   PetscCall(PetscBTCreate(Mbs, &table0));

488:   nidx[0] = is_max;
489:   idx_i   = data + is_max + 1;   /* ptr to input is[0] array */
490:   nidx_i  = nidx + is_max + 1;   /* ptr to output is[0] array */
491:   for (i = 0; i < is_max; i++) { /* for each is */
492:     isz = 0;
493:     n   = data[1 + i]; /* size of input is[i] */

495:     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
496:     if (whose == MINE) { /* process this processor's is[] */
497:       table_i = table[i];
498:       nidx_i  = nidx + 1 + is_max + Mbs * i;
499:     } else { /* process other processor's is[] - only use one temp table */
500:       table_i = table[0];
501:     }
502:     PetscCall(PetscBTMemzero(Mbs, table_i));
503:     PetscCall(PetscBTMemzero(Mbs, table0));
504:     if (n == 0) {
505:       nidx[1 + i] = 0; /* size of new is[i] */
506:       continue;
507:     }

509:     isz0    = 0;
510:     col_max = 0;
511:     for (j = 0; j < n; j++) {
512:       col = idx_i[j];
513:       PetscCheck(col < Mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index col %" PetscInt_FMT " >= Mbs %" PetscInt_FMT, col, Mbs);
514:       if (!PetscBTLookupSet(table_i, col)) {
515:         PetscCall(PetscBTSet(table0, col));
516:         if (whose == MINE) nidx_i[isz0] = col;
517:         if (col_max < col) col_max = col;
518:         isz0++;
519:       }
520:     }

522:     if (whose == MINE) isz = isz0;
523:     k = 0; /* no. of indices from input is[i] that have been examined */
524:     for (row = 0; row < mbs; row++) {
525:       a_start = ai[row];
526:       a_end   = ai[row + 1];
527:       b_start = bi[row];
528:       b_end   = bi[row + 1];
529:       if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]:
530:                                                 do row search: collect all col in this row */
531:         for (l = a_start; l < a_end; l++) {      /* Amat */
532:           col = aj[l] + rstart;
533:           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
534:         }
535:         for (l = b_start; l < b_end; l++) { /* Bmat */
536:           col = garray[bj[l]];
537:           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
538:         }
539:         k++;
540:         if (k >= isz0) break;               /* for (row=0; row<mbs; row++) */
541:       } else {                              /* row is not on input is[i]:
542:                   do col search: add row onto nidx_i if this row has a col that is in input is[i] */
543:         for (l = a_start; l < a_end; l++) { /* Amat */
544:           col = aj[l] + rstart;
545:           if (col > col_max) break;
546:           if (PetscBTLookup(table0, col)) {
547:             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
548:             break; /* for l = start; l<end ; l++) */
549:           }
550:         }
551:         for (l = b_start; l < b_end; l++) { /* Bmat */
552:           col = garray[bj[l]];
553:           if (col > col_max) break;
554:           if (PetscBTLookup(table0, col)) {
555:             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
556:             break; /* for l = start; l<end ; l++) */
557:           }
558:         }
559:       }
560:     }

562:     if (i < is_max - 1) {
563:       idx_i += n;    /* ptr to input is[i+1] array */
564:       nidx_i += isz; /* ptr to output is[i+1] array */
565:     }
566:     nidx[1 + i] = isz; /* size of new is[i] */
567:   } /* for each is */
568:   PetscCall(PetscBTDestroy(&table0));
569:   PetscFunctionReturn(PETSC_SUCCESS);
570: }