Actual source code: vscat.c

  1: #include <petsc/private/sfimpl.h>
  2: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  3: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  4: #include <petsc/private/vecimpl.h>

  6: PETSC_EXTERN PetscErrorCode PetscGatherNumberOfMessages_Private(MPI_Comm, const PetscMPIInt[], const PetscInt[], PetscMPIInt *);
  7: PETSC_EXTERN PetscErrorCode PetscGatherMessageLengths_Private(MPI_Comm, PetscMPIInt, PetscMPIInt, const PetscInt[], PetscMPIInt **, PetscInt **);

  9: typedef enum {
 10:   IS_INVALID,
 11:   IS_GENERAL,
 12:   IS_BLOCK,
 13:   IS_STRIDE
 14: } ISTypeID;

 16: static inline PetscErrorCode ISGetTypeID_Private(IS is, ISTypeID *id)
 17: {
 18:   PetscBool same;

 20:   PetscFunctionBegin;
 21:   *id = IS_INVALID;
 22:   PetscCall(PetscObjectTypeCompare((PetscObject)is, ISGENERAL, &same));
 23:   if (same) {
 24:     *id = IS_GENERAL;
 25:     goto functionend;
 26:   }
 27:   PetscCall(PetscObjectTypeCompare((PetscObject)is, ISBLOCK, &same));
 28:   if (same) {
 29:     *id = IS_BLOCK;
 30:     goto functionend;
 31:   }
 32:   PetscCall(PetscObjectTypeCompare((PetscObject)is, ISSTRIDE, &same));
 33:   if (same) {
 34:     *id = IS_STRIDE;
 35:     goto functionend;
 36:   }
 37: functionend:
 38:   PetscFunctionReturn(PETSC_SUCCESS);
 39: }

 41: static PetscErrorCode VecScatterBegin_Internal(VecScatter sf, Vec x, Vec y, InsertMode addv, ScatterMode mode)
 42: {
 43:   PetscSF      wsf = NULL; /* either sf or its local part */
 44:   MPI_Op       mop = MPI_OP_NULL;
 45:   PetscMPIInt  size;
 46:   PetscMemType xmtype = PETSC_MEMTYPE_HOST, ymtype = PETSC_MEMTYPE_HOST;

 48:   PetscFunctionBegin;
 49:   if (x != y) PetscCall(VecLockReadPush(x));
 50:   PetscCall(VecGetArrayReadAndMemType(x, &sf->vscat.xdata, &xmtype));
 51:   PetscCall(VecGetArrayAndMemType(y, &sf->vscat.ydata, &ymtype));
 52:   PetscCall(VecLockWriteSet(y, PETSC_TRUE));

 54:   /* SCATTER_FORWARD_LOCAL indicates ignoring inter-process communication */
 55:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
 56:   if ((mode & SCATTER_FORWARD_LOCAL) && size > 1) { /* Lazy creation of sf->vscat.lsf since SCATTER_FORWARD_LOCAL is uncommon */
 57:     if (!sf->vscat.lsf) PetscCall(PetscSFCreateLocalSF_Private(sf, &sf->vscat.lsf));
 58:     wsf = sf->vscat.lsf;
 59:   } else {
 60:     wsf = sf;
 61:   }

 63:   /* Note xdata/ydata is always recorded on sf (not lsf) above */
 64:   if (addv == INSERT_VALUES) mop = MPI_REPLACE;
 65:   else if (addv == ADD_VALUES) mop = MPIU_SUM; /* Petsc defines its own MPI datatype and SUM operation for __float128 etc. */
 66:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 67:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 68:   else SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "Unsupported InsertMode %d in VecScatterBegin/End", addv);

 70:   if (mode & SCATTER_REVERSE) { /* REVERSE indicates leaves to root scatter. Note that x and y are swapped in input */
 71:     PetscCall(PetscSFReduceWithMemTypeBegin(wsf, sf->vscat.unit, xmtype, sf->vscat.xdata, ymtype, sf->vscat.ydata, mop));
 72:   } else { /* FORWARD indicates x to y scatter, where x is root and y is leaf */
 73:     PetscCall(PetscSFBcastWithMemTypeBegin(wsf, sf->vscat.unit, xmtype, sf->vscat.xdata, ymtype, sf->vscat.ydata, mop));
 74:   }
 75:   PetscFunctionReturn(PETSC_SUCCESS);
 76: }

 78: static PetscErrorCode VecScatterEnd_Internal(VecScatter sf, Vec x, Vec y, InsertMode addv, ScatterMode mode)
 79: {
 80:   PetscSF     wsf = NULL;
 81:   MPI_Op      mop = MPI_OP_NULL;
 82:   PetscMPIInt size;

 84:   PetscFunctionBegin;
 85:   /* SCATTER_FORWARD_LOCAL indicates ignoring inter-process communication */
 86:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
 87:   wsf = ((mode & SCATTER_FORWARD_LOCAL) && size > 1) ? sf->vscat.lsf : sf;

 89:   if (addv == INSERT_VALUES) mop = MPI_REPLACE;
 90:   else if (addv == ADD_VALUES) mop = MPIU_SUM;
 91:   else if (addv == MAX_VALUES) mop = MPIU_MAX;
 92:   else if (addv == MIN_VALUES) mop = MPIU_MIN;
 93:   else SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "Unsupported InsertMode %d in VecScatterBegin/End", addv);

 95:   if (mode & SCATTER_REVERSE) { /* reverse scatter sends leaves to roots. Note that x and y are swapped in input */
 96:     PetscCall(PetscSFReduceEnd(wsf, sf->vscat.unit, sf->vscat.xdata, sf->vscat.ydata, mop));
 97:   } else { /* forward scatter sends roots to leaves, i.e., x to y */
 98:     PetscCall(PetscSFBcastEnd(wsf, sf->vscat.unit, sf->vscat.xdata, sf->vscat.ydata, mop));
 99:   }

101:   PetscCall(VecRestoreArrayReadAndMemType(x, &sf->vscat.xdata));
102:   if (x != y) PetscCall(VecLockReadPop(x));
103:   PetscCall(VecRestoreArrayAndMemType(y, &sf->vscat.ydata));
104:   PetscCall(VecLockWriteSet(y, PETSC_FALSE));
105:   PetscFunctionReturn(PETSC_SUCCESS);
106: }

108: /* VecScatterRemap provides a light way to slightly modify a VecScatter. Suppose the input sf scatters
109:    x[i] to y[j], tomap gives a plan to change vscat to scatter x[tomap[i]] to y[j]. Note that in SF,
110:    x is roots. That means we need to change incoming stuffs such as bas->irootloc[].
111:  */
112: static PetscErrorCode VecScatterRemap_Internal(VecScatter sf, const PetscInt *tomap, const PetscInt *frommap)
113: {
114:   PetscInt       i, bs = sf->vscat.bs;
115:   PetscMPIInt    size;
116:   PetscBool      ident = PETSC_TRUE, isbasic, isneighbor;
117:   PetscSFType    type;
118:   PetscSF_Basic *bas = NULL;

120:   PetscFunctionBegin;
121:   /* check if it is an identity map. If it is, do nothing */
122:   if (tomap) {
123:     for (i = 0; i < sf->nroots * bs; i++) {
124:       if (i != tomap[i]) {
125:         ident = PETSC_FALSE;
126:         break;
127:       }
128:     }
129:     if (ident) PetscFunctionReturn(PETSC_SUCCESS);
130:   }
131:   PetscCheck(!frommap, PETSC_COMM_SELF, PETSC_ERR_SUP, "Unable to remap the FROM in scatters yet");
132:   if (!tomap) PetscFunctionReturn(PETSC_SUCCESS);

134:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));

136:   /* Since the indices changed, we must also update the local SF. But we do not do it since
137:      lsf is rarely used. We just destroy lsf and rebuild it on demand from updated sf.
138:   */
139:   if (sf->vscat.lsf) PetscCall(PetscSFDestroy(&sf->vscat.lsf));

141:   PetscCall(PetscSFGetType(sf, &type));
142:   PetscCall(PetscObjectTypeCompare((PetscObject)sf, PETSCSFBASIC, &isbasic));
143:   PetscCall(PetscObjectTypeCompare((PetscObject)sf, PETSCSFNEIGHBOR, &isneighbor));
144:   PetscCheck(isbasic || isneighbor, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "VecScatterRemap on SF type %s is not supported", type);

146:   PetscCall(PetscSFSetUp(sf)); /* to build sf->irootloc if SetUp is not yet called */

148:   /* Root indices are going to be remapped. This is tricky for SF. Root indices are used in sf->rremote,
149:     sf->remote and bas->irootloc. The latter one is cheap to remap, but the former two are not.
150:     To remap them, we have to do a bcast from roots to leaves, to let leaves know their updated roots.
151:     Since VecScatterRemap is supposed to be a cheap routine to adapt a vecscatter by only changing where
152:     x[] data is taken, we do not remap sf->rremote, sf->remote. The consequence is that operations
153:     accessing them (such as PetscSFCompose) may get stale info. Considering VecScatter does not need
154:     that complicated SF operations, we do not remap sf->rremote, sf->remote, instead we destroy them
155:     so that code accessing them (if any) will crash (instead of get silent errors). Note that BcastAndOp/Reduce,
156:     which are used by VecScatter and only rely on bas->irootloc, are updated and correct.
157:   */
158:   sf->remote = NULL;
159:   PetscCall(PetscFree(sf->remote_alloc));
160:   /* Not easy to free sf->rremote since it was allocated with PetscMalloc4(), so just give it crazy values */
161:   for (i = 0; i < sf->roffset[sf->nranks]; i++) sf->rremote[i] = PETSC_INT_MIN;

163:   /* Indices in tomap[] are for each individual vector entry. But indices in sf are for each
164:      block in the vector. So before the remapping, we have to expand indices in sf by bs, and
165:      after the remapping, we have to shrink them back.
166:    */
167:   bas = (PetscSF_Basic *)sf->data;
168:   for (i = 0; i < bas->ioffset[bas->niranks]; i++) bas->irootloc[i] = tomap[bas->irootloc[i] * bs] / bs;
169: #if defined(PETSC_HAVE_DEVICE)
170:   /* Free the irootloc copy on device. We allocate a new copy and get the updated value on demand. See PetscSFLinkGetRootPackOptAndIndices() */
171:   for (i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[i]));
172: #endif
173:   /* Destroy and then rebuild root packing optimizations since indices are changed */
174:   PetscCall(PetscSFResetPackFields(sf));
175:   PetscCall(PetscSFSetUpPackFields(sf));
176:   PetscFunctionReturn(PETSC_SUCCESS);
177: }

179: /*
180:   Given a parallel VecScatter context, return number of procs and vector entries involved in remote (i.e., off-process) communication

182:   Input Parameters:
183: + sf   - the context (must be a parallel vecscatter)
184: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

186:   Output parameters:
187: + num_procs   - number of remote processors
188: - num_entries - number of vector entries to send or recv

190:   Notes:
191:   Sometimes PETSc internally needs to use the matrix-vector-multiply vecscatter context for other purposes. The client code
192:   usually only uses MPI_Send/Recv. This group of subroutines provides info needed for such uses.

194: .seealso: [](sec_scatter), `VecScatterGetRemote_Private()`, `VecScatterGetRemoteOrdered_Private()`
195:  */
196: PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter sf, PetscBool send, PetscInt *num_procs, PetscInt *num_entries)
197: {
198:   PetscMPIInt        nranks, remote_start;
199:   PetscMPIInt        rank;
200:   const PetscInt    *offset;
201:   const PetscMPIInt *ranks;

203:   PetscFunctionBegin;
204:   PetscCall(PetscSFSetUp(sf));
205:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));

207:   /* This routine is mainly used for MatMult's Mvctx. In Mvctx, we scatter an MPI vector x to a sequential vector lvec.
208:      Remember x is roots and lvec is leaves. 'send' means roots to leaves communication. If 'send' is true, we need to
209:      get info about which ranks this processor needs to send to. In other words, we need to call PetscSFGetLeafRanks().
210:      If send is false, we do the opposite, calling PetscSFGetRootRanks().
211:   */
212:   if (send) PetscCall(PetscSFGetLeafRanks(sf, &nranks, &ranks, &offset, NULL));
213:   else PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, &offset, NULL, NULL));
214:   if (nranks) {
215:     remote_start = (rank == ranks[0]) ? 1 : 0;
216:     if (num_procs) *num_procs = nranks - remote_start;
217:     if (num_entries) *num_entries = offset[nranks] - offset[remote_start];
218:   } else {
219:     if (num_procs) *num_procs = 0;
220:     if (num_entries) *num_entries = 0;
221:   }
222:   PetscFunctionReturn(PETSC_SUCCESS);
223: }

225: /* Given a parallel VecScatter context, return a plan that represents the remote communication.
226:    Any output parameter can be NULL.

228:   Input Parameters:
229: + sf   - the context
230: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

232:   Output parameters:
233: + n        - number of remote processors
234: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
235:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
236:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
237:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
238: . indices  - indices of entries to send/recv
239: . procs    - ranks of remote processors
240: - bs       - block size

242: .seealso: `VecScatterRestoreRemote_Private()`, `VecScatterGetRemoteOrdered_Private()`
243:  */
244: PetscErrorCode VecScatterGetRemote_Private(VecScatter sf, PetscBool send, PetscMPIInt *n, const PetscInt **starts, const PetscInt **indices, const PetscMPIInt **procs, PetscInt *bs)
245: {
246:   PetscMPIInt        nranks, remote_start;
247:   PetscMPIInt        rank;
248:   const PetscInt    *offset, *location;
249:   const PetscMPIInt *ranks;

251:   PetscFunctionBegin;
252:   PetscCall(PetscSFSetUp(sf));
253:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));

255:   if (send) PetscCall(PetscSFGetLeafRanks(sf, &nranks, &ranks, &offset, &location));
256:   else PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, &offset, &location, NULL));

258:   if (nranks) {
259:     remote_start = (rank == ranks[0]) ? 1 : 0;
260:     if (n) *n = nranks - remote_start;
261:     if (starts) *starts = &offset[remote_start];
262:     if (indices) *indices = location; /* not &location[offset[remote_start]]. Starts[0] may point to the middle of indices[] */
263:     if (procs) *procs = &ranks[remote_start];
264:   } else {
265:     if (n) *n = 0;
266:     if (starts) *starts = NULL;
267:     if (indices) *indices = NULL;
268:     if (procs) *procs = NULL;
269:   }

271:   if (bs) *bs = 1;
272:   PetscFunctionReturn(PETSC_SUCCESS);
273: }

275: /* Given a parallel VecScatter context, return a plan that represents the remote communication. Ranks of remote
276:    processors returned in procs must be sorted in ascending order. Any output parameter can be NULL.

278:   Input Parameters:
279: + sf   - the context
280: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

282:   Output parameters:
283: + n        - number of remote processors
284: . starts   - starting point in indices for each proc. ATTENTION: starts[0] is not necessarily zero.
285:              Therefore, expressions like starts[i+1]-starts[i] and indices[starts[i]+j] work as
286:              expected for a CSR structure but buf[starts[i]+j] may be out of range if buf was allocated
287:              with length starts[n]-starts[0]. One should use buf[starts[i]-starts[0]+j] instead.
288: . indices  - indices of entries to send/recv
289: . procs    - ranks of remote processors
290: - bs       - block size

292:   Notes:
293:   Output parameters like starts, indices must also be adapted according to the sorted ranks.

295: .seealso: `VecScatterRestoreRemoteOrdered_Private()`, `VecScatterGetRemote_Private()`
296:  */
297: PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter sf, PetscBool send, PetscMPIInt *n, const PetscInt **starts, const PetscInt **indices, const PetscMPIInt **procs, PetscInt *bs)
298: {
299:   PetscFunctionBegin;
300:   PetscCall(VecScatterGetRemote_Private(sf, send, n, starts, indices, procs, bs));
301:   if (PetscUnlikelyDebug(n && procs)) {
302:     PetscMPIInt i;
303:     /* from back to front to also handle cases *n=0 */
304:     for (i = *n - 1; i > 0; i--) PetscCheck((*procs)[i - 1] <= (*procs)[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "procs[] are not ordered");
305:   }
306:   PetscFunctionReturn(PETSC_SUCCESS);
307: }

309: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemote_Private. This gives a chance for
310:    an implementation to free memory allocated in the VecScatterGetRemote_Private call.

312:   Input Parameters:
313: + sf   - the context
314: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

316:   Output parameters:
317: + n        - number of remote processors
318: . starts   - starting point in indices for each proc
319: . indices  - indices of entries to send/recv
320: . procs    - ranks of remote processors
321: - bs       - block size

323: .seealso: `VecScatterGetRemote_Private()`
324:  */
325: PetscErrorCode VecScatterRestoreRemote_Private(VecScatter sf, PetscBool send, PetscMPIInt *n, const PetscInt **starts, const PetscInt **indices, const PetscMPIInt **procs, PetscInt *bs)
326: {
327:   PetscFunctionBegin;
328:   if (starts) *starts = NULL;
329:   if (indices) *indices = NULL;
330:   if (procs) *procs = NULL;
331:   PetscFunctionReturn(PETSC_SUCCESS);
332: }

334: /* Given a parallel VecScatter context, restore the plan returned by VecScatterGetRemoteOrdered_Private. This gives a chance for
335:    an implementation to free memory allocated in the VecScatterGetRemoteOrdered_Private call.

337:   Input Parameters:
338: + sf   - the context
339: - send  - true to select the send info (i.e., todata), otherwise to select the recv info (i.e., fromdata)

341:   Output parameters:
342: + n        - number of remote processors
343: . starts   - starting point in indices for each proc
344: . indices  - indices of entries to send/recv
345: . procs    - ranks of remote processors
346: - bs       - block size

348: .seealso: `VecScatterGetRemoteOrdered_Private()`
349:  */
350: PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter sf, PetscBool send, PetscMPIInt *n, const PetscInt **starts, const PetscInt **indices, const PetscMPIInt **procs, PetscInt *bs)
351: {
352:   PetscFunctionBegin;
353:   PetscCall(VecScatterRestoreRemote_Private(sf, send, n, starts, indices, procs, bs));
354:   PetscFunctionReturn(PETSC_SUCCESS);
355: }

357: /*@
358:   VecScatterSetUp - Sets up the `VecScatter` to be able to actually scatter information between vectors

360:   Collective

362:   Input Parameter:
363: . sf - the scatter context

365:   Level: intermediate

367: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterCopy()`
368: @*/
369: PetscErrorCode VecScatterSetUp(VecScatter sf)
370: {
371:   PetscFunctionBegin;
372:   PetscCall(PetscSFSetUp(sf));
373:   PetscFunctionReturn(PETSC_SUCCESS);
374: }

376: /*@
377:   VecScatterSetType - Builds a vector scatter, for a particular vector scatter implementation.

379:   Collective

381:   Input Parameters:
382: + sf   - The `VecScatter` object
383: - type - The name of the vector scatter type

385:   Options Database Key:
386: . -sf_type <type> - Sets the `VecScatterType`

388:   Level: intermediate

390:   Note:
391:   Use `VecScatterDuplicate()` to form additional vectors scatter of the same type as an existing vector scatter.

393: .seealso: [](sec_scatter), `VecScatter`, `VecScatterType`, `VecScatterGetType()`, `VecScatterCreate()`
394: @*/
395: PetscErrorCode VecScatterSetType(VecScatter sf, VecScatterType type)
396: {
397:   PetscFunctionBegin;
398:   PetscCall(PetscSFSetType(sf, type));
399:   PetscFunctionReturn(PETSC_SUCCESS);
400: }

402: /*@
403:   VecScatterGetType - Gets the vector scatter type name (as a string) from the `VecScatter`.

405:   Not Collective

407:   Input Parameter:
408: . sf - The vector scatter

410:   Output Parameter:
411: . type - The vector scatter type name

413:   Level: intermediate

415: .seealso: [](sec_scatter), `VecScatter`, `VecScatterType`, `VecScatterSetType()`, `VecScatterCreate()`
416: @*/
417: PetscErrorCode VecScatterGetType(VecScatter sf, VecScatterType *type)
418: {
419:   PetscFunctionBegin;
420:   PetscCall(PetscSFGetType(sf, type));
421:   PetscFunctionReturn(PETSC_SUCCESS);
422: }

424: /*@C
425:   VecScatterRegister -  Adds a new vector scatter component implementation

427:   Not Collective

429:   Input Parameters:
430: + sname    - The name of a new user-defined creation routine
431: - function - The creation routine

433:   Level: advanced

435: .seealso: [](sec_scatter), `VecScatter`, `VecScatterType`, `VecRegister()`
436: @*/
437: PetscErrorCode VecScatterRegister(const char sname[], PetscErrorCode (*function)(VecScatter))
438: {
439:   PetscFunctionBegin;
440:   PetscCall(PetscSFRegister(sname, function));
441:   PetscFunctionReturn(PETSC_SUCCESS);
442: }

444: /*@
445:   VecScatterGetMerged - Returns true if the scatter is completed in the `VecScatterBegin()`
446:   and the `VecScatterEnd()` does nothing

448:   Not Collective

450:   Input Parameter:
451: . sf - scatter context created with `VecScatterCreate()`

453:   Output Parameter:
454: . flg - `PETSC_TRUE` if the `VecScatterBegin()`/`VecScatterEnd()` are all done during the `VecScatterBegin()`

456:   Level: developer

458: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterEnd()`, `VecScatterBegin()`
459: @*/
460: PetscErrorCode VecScatterGetMerged(VecScatter sf, PetscBool *flg)
461: {
462:   PetscFunctionBegin;
464:   if (flg) *flg = sf->vscat.beginandendtogether;
465:   PetscFunctionReturn(PETSC_SUCCESS);
466: }
467: /*@
468:   VecScatterDestroy - Destroys a scatter context created by `VecScatterCreate()`

470:   Collective

472:   Input Parameter:
473: . sf - the scatter context

475:   Level: intermediate

477: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterCopy()`
478: @*/
479: PetscErrorCode VecScatterDestroy(VecScatter *sf)
480: {
481:   PetscFunctionBegin;
482:   PetscCall(PetscSFDestroy(sf));
483:   PetscFunctionReturn(PETSC_SUCCESS);
484: }

486: /*@
487:   VecScatterCopy - Makes a copy of a scatter context.

489:   Collective

491:   Input Parameter:
492: . sf - the scatter context

494:   Output Parameter:
495: . newsf - the context copy

497:   Level: advanced

499: .seealso: [](sec_scatter), `VecScatter`, `VecScatterType`, `VecScatterCreate()`, `VecScatterDestroy()`
500: @*/
501: PetscErrorCode VecScatterCopy(VecScatter sf, VecScatter *newsf)
502: {
503:   PetscFunctionBegin;
504:   PetscAssertPointer(newsf, 2);
505:   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_GRAPH, newsf));
506:   PetscCall(PetscSFSetUp(*newsf));
507:   PetscFunctionReturn(PETSC_SUCCESS);
508: }

510: /*@
511:   VecScatterViewFromOptions - View a `VecScatter` object based on values in the options database

513:   Collective

515:   Input Parameters:
516: + sf   - the scatter context
517: . obj  - Optional object
518: - name - command line option

520:   Level: intermediate

522:   Note:
523:   See `PetscObjectViewFromOptions()` for available `PetscViewer` and `PetscViewerFormat` values

525: .seealso: [](sec_scatter), `VecScatter`, `VecScatterView()`, `PetscObjectViewFromOptions()`, `VecScatterCreate()`
526: @*/
527: PetscErrorCode VecScatterViewFromOptions(VecScatter sf, PetscObject obj, const char name[])
528: {
529:   PetscFunctionBegin;
531:   PetscCall(PetscObjectViewFromOptions((PetscObject)sf, obj, name));
532:   PetscFunctionReturn(PETSC_SUCCESS);
533: }

535: /*@
536:   VecScatterView - Views a vector scatter context.

538:   Collective

540:   Input Parameters:
541: + sf     - the scatter context
542: - viewer - the viewer for displaying the context

544:   Level: intermediate

546: .seealso: [](sec_scatter), `VecScatter`, `PetscViewer`, `VecScatterViewFromOptions()`, `PetscObjectViewFromOptions()`, `VecScatterCreate()`
547: @*/
548: PetscErrorCode VecScatterView(VecScatter sf, PetscViewer viewer)
549: {
550:   PetscFunctionBegin;
551:   PetscCall(PetscSFView(sf, viewer));
552:   PetscFunctionReturn(PETSC_SUCCESS);
553: }

555: /*@
556:   VecScatterRemap - Remaps the "from" and "to" indices in a
557:   vector scatter context.

559:   Collective

561:   Input Parameters:
562: + sf      - vector scatter context
563: . tomap   - remapping plan for "to" indices (may be `NULL`).
564: - frommap - remapping plan for "from" indices (may be `NULL`)

566:   Level: developer

568:   Notes:
569:   In the parallel case the todata contains indices from where the data is taken
570:   (and then sent to others)! The fromdata contains indices from where the received
571:   data is finally put locally.

573:   In the sequential case the todata contains indices from where the data is put
574:   and the fromdata contains indices from where the data is taken from.
575:   This is backwards from the parallel case!

577: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`
578: @*/
579: PetscErrorCode VecScatterRemap(VecScatter sf, PetscInt tomap[], PetscInt frommap[])
580: {
581:   PetscFunctionBegin;
582:   if (tomap) PetscAssertPointer(tomap, 2);
583:   if (frommap) PetscAssertPointer(frommap, 3);
584:   PetscCall(VecScatterRemap_Internal(sf, tomap, frommap));
585:   PetscCheck(!frommap, PETSC_COMM_SELF, PETSC_ERR_SUP, "Unable to remap the FROM in scatters yet");
586:   /* Mark then vector lengths as unknown because we do not know the lengths of the remapped vectors */
587:   sf->vscat.from_n = -1;
588:   sf->vscat.to_n   = -1;
589:   PetscFunctionReturn(PETSC_SUCCESS);
590: }

592: /*@
593:   VecScatterSetFromOptions - Configures the vector scatter from values in the options database.

595:   Collective

597:   Input Parameter:
598: . sf - The vector scatter

600:   Notes:
601:   To see all options, run your program with the -help option, or consult the users manual.

603:   Must be called before `VecScatterSetUp()` and before the vector scatter is used.

605:   Level: beginner

607: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterDestroy()`, `VecScatterSetUp()`
608: @*/
609: PetscErrorCode VecScatterSetFromOptions(VecScatter sf)
610: {
611:   PetscFunctionBegin;
613:   PetscObjectOptionsBegin((PetscObject)sf);

615:   sf->vscat.beginandendtogether = PETSC_FALSE;
616:   PetscCall(PetscOptionsBool("-vecscatter_merge", "Use combined (merged) vector scatter begin and end", "VecScatterCreate", sf->vscat.beginandendtogether, &sf->vscat.beginandendtogether, NULL));
617:   if (sf->vscat.beginandendtogether) PetscCall(PetscInfo(sf, "Using combined (merged) vector scatter begin and end\n"));
618:   PetscOptionsEnd();
619:   PetscFunctionReturn(PETSC_SUCCESS);
620: }

622: /*@
623:   VecScatterCreate - Creates a vector scatter context.

625:   Collective

627:   Input Parameters:
628: + x  - a vector that defines the shape (parallel data layout of the vector) of vectors from
629:        which we scatter
630: . y  - a vector that defines the shape (parallel data layout of the vector) of vectors to which
631:        we scatter
632: . ix - the indices of xin to scatter (if `NULL` scatters all values)
633: - iy - the indices of yin to hold results (if `NULL` fills entire vector `yin` in order)

635:   Output Parameter:
636: . newsf - location to store the new scatter context

638:   Options Database Keys:
639: + -vecscatter_view              - Prints detail of communications
640: . -vecscatter_view ::ascii_info - Print less details about communication
641: - -vecscatter_merge             - `VecScatterBegin()` handles all of the communication, `VecScatterEnd()` is a nop
642:                               eliminates the chance for overlap of computation and communication

644:   Level: intermediate

646:   Notes:
647:   If both `xin` and `yin` are parallel, their communicator must be on the same
648:   set of processes, but their process order can be different.
649:   In calls to the scatter options you can use different vectors than the `xin` and
650:   `yin` you used above; BUT they must have the same parallel data layout, for example,
651:   they could be obtained from `VecDuplicate()`.
652:   A `VecScatter` context CANNOT be used in two or more simultaneous scatters;
653:   that is you cannot call a second `VecScatterBegin()` with the same scatter
654:   context until the `VecScatterEnd()` has been called on the first `VecScatterBegin()`.
655:   In this case a separate `VecScatter` is needed for each concurrent scatter.

657:   Both `ix` and `iy` cannot be `NULL` at the same time.

659:   Use `VecScatterCreateToAll()` to create a vecscatter that copies an MPI vector to sequential vectors on all MPI ranks.
660:   Use `VecScatterCreateToZero()` to create a vecscatter that copies an MPI vector to a sequential vector on MPI rank 0.
661:   These special vecscatters have better performance than general ones.

663: .seealso: [](sec_scatter), `VecScatter`, `VecScatterDestroy()`, `VecScatterCreateToAll()`, `VecScatterCreateToZero()`, `PetscSFCreate()`
664: @*/
665: PetscErrorCode VecScatterCreate(Vec x, IS ix, Vec y, IS iy, VecScatter *newsf)
666: {
667:   MPI_Comm        xcomm, ycomm, bigcomm;
668:   Vec             xx, yy;
669:   IS              ix_old = ix, iy_old = iy, ixx, iyy;
670:   PetscMPIInt     xcommsize, ycommsize, rank, result;
671:   PetscInt        i, n, N, nroots, nleaves, *ilocal, xstart, ystart, ixsize, iysize, xlen, ylen;
672:   const PetscInt *xindices, *yindices;
673:   PetscSFNode    *iremote;
674:   PetscLayout     xlayout, ylayout;
675:   ISTypeID        ixid, iyid;
676:   PetscInt        bs, bsx, bsy, min, max, m[2], mg[2], ixfirst, ixstep, iyfirst, iystep;
677:   PetscBool       can_do_block_opt = PETSC_FALSE;
678:   PetscSF         sf;

680:   PetscFunctionBegin;
681:   PetscAssertPointer(newsf, 5);
682:   PetscCheck(ix || iy, PetscObjectComm((PetscObject)x), PETSC_ERR_SUP, "Cannot pass default in for both input and output indices");

684:   /* Get comm from x and y */
685:   PetscCall(PetscObjectGetComm((PetscObject)x, &xcomm));
686:   PetscCallMPI(MPI_Comm_size(xcomm, &xcommsize));
687:   PetscCall(PetscObjectGetComm((PetscObject)y, &ycomm));
688:   PetscCallMPI(MPI_Comm_size(ycomm, &ycommsize));
689:   if (xcommsize > 1 && ycommsize > 1) {
690:     PetscCallMPI(MPI_Comm_compare(xcomm, ycomm, &result));
691:     PetscCheck(result != MPI_UNEQUAL, PETSC_COMM_SELF, PETSC_ERR_ARG_NOTSAMECOMM, "VecScatterCreate: parallel vectors x and y must have identical/congruent/similar communicators");
692:   }
693:   bs = 1; /* default, no blocking */

695:   /*
696:    Let P and S stand for parallel and sequential vectors respectively. There are four combinations of vecscatters: PtoP, PtoS,
697:    StoP and StoS. The assumption of VecScatterCreate(Vec x,IS ix,Vec y,IS iy,VecScatter *newctx) is: if x is parallel, then ix
698:    contains global indices of x. If x is sequential, ix contains local indices of x. Similarly for y and iy.

700:    SF builds around concepts of local leaves and remote roots. We treat source vector x as roots and destination vector y as
701:    leaves. A PtoS scatter can be naturally mapped to SF. We transform PtoP and StoP to PtoS, and treat StoS as trivial PtoS.
702:   */

704:   /* NULL ix or iy in VecScatterCreate(x,ix,y,iy,newctx) has special meaning. Recover them for these cases */
705:   if (!ix) {
706:     if (xcommsize > 1 && ycommsize == 1) { /* PtoS: null ix means the whole x will be scattered to each seq y */
707:       PetscCall(VecGetSize(x, &N));
708:       PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &ix));
709:     } else { /* PtoP, StoP or StoS: null ix means the whole local part of x will be scattered */
710:       PetscCall(VecGetLocalSize(x, &n));
711:       PetscCall(VecGetOwnershipRange(x, &xstart, NULL));
712:       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, xstart, 1, &ix));
713:     }
714:   }

716:   if (!iy) {
717:     if (xcommsize == 1 && ycommsize > 1) { /* StoP: null iy means the whole y will be scattered to from each seq x */
718:       PetscCall(VecGetSize(y, &N));
719:       PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iy));
720:     } else { /* PtoP, StoP or StoS: null iy means the whole local part of y will be scattered to */
721:       PetscCall(VecGetLocalSize(y, &n));
722:       PetscCall(VecGetOwnershipRange(y, &ystart, NULL));
723:       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, ystart, 1, &iy));
724:     }
725:   }

727:   /* Do error checking immediately after we have non-empty ix, iy */
728:   PetscCall(ISGetLocalSize(ix, &ixsize));
729:   PetscCall(ISGetLocalSize(iy, &iysize));
730:   PetscCall(VecGetSize(x, &xlen));
731:   PetscCall(VecGetSize(y, &ylen));
732:   PetscCheck(ixsize == iysize, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Scatter sizes of ix and iy don't match locally ix=%" PetscInt_FMT " iy=%" PetscInt_FMT, ixsize, iysize);
733:   PetscCall(ISGetMinMax(ix, &min, &max));
734:   PetscCheck(min >= 0 && max < xlen, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Scatter indices in ix are out of range: found [%" PetscInt_FMT ",%" PetscInt_FMT "), expected in [0,%" PetscInt_FMT ")", min, max, xlen);
735:   PetscCall(ISGetMinMax(iy, &min, &max));
736:   PetscCheck(min >= 0 && max < ylen, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Scatter indices in iy are out of range: found [%" PetscInt_FMT ",%" PetscInt_FMT "), expected in [0,%" PetscInt_FMT ")", min, max, ylen);

738:   /* Extract info about ix, iy for further test */
739:   PetscCall(ISGetTypeID_Private(ix, &ixid));
740:   PetscCall(ISGetTypeID_Private(iy, &iyid));
741:   if (ixid == IS_BLOCK) PetscCall(ISGetBlockSize(ix, &bsx));
742:   else if (ixid == IS_STRIDE) PetscCall(ISStrideGetInfo(ix, &ixfirst, &ixstep));

744:   if (iyid == IS_BLOCK) PetscCall(ISGetBlockSize(iy, &bsy));
745:   else if (iyid == IS_STRIDE) PetscCall(ISStrideGetInfo(iy, &iyfirst, &iystep));

747:   /* Check if a PtoS is special ToAll/ToZero scatters, which can be results of VecScatterCreateToAll/Zero.
748:      ToAll means a whole MPI vector is copied to a seq vector on every process. ToZero means a whole MPI
749:      vector is copied to a seq vector on rank 0 and other processes do nothing(i.e.,they input empty ix,iy).

751:      We can optimize these scatters with MPI collectives. We can also avoid costly analysis used for general scatters.
752:   */
753:   if (xcommsize > 1 && ycommsize == 1) { /* Ranks do not diverge at this if-test */
754:     PetscInt    pattern[2] = {0, 0};     /* A boolean array with pattern[0] for allgather-like (ToAll) and pattern[1] for gather-like (ToZero) */
755:     PetscLayout map;

757:     PetscCallMPI(MPI_Comm_rank(xcomm, &rank));
758:     PetscCall(VecGetLayout(x, &map));
759:     if (rank == 0) {
760:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
761:         /* Rank 0 scatters the whole mpi x to seq y, so it is either a ToAll or a ToZero candidate in its view */
762:         pattern[0] = pattern[1] = 1;
763:       }
764:     } else {
765:       if (ixid == IS_STRIDE && iyid == IS_STRIDE && ixsize == xlen && ixfirst == 0 && ixstep == 1 && iyfirst == 0 && iystep == 1) {
766:         /* Other ranks also scatter the whole mpi x to seq y, so it is a ToAll candidate in their view */
767:         pattern[0] = 1;
768:       } else if (ixsize == 0) {
769:         /* Other ranks do nothing, so it is a ToZero candidate */
770:         pattern[1] = 1;
771:       }
772:     }

774:     /* One stone (the expensive allreduce) two birds: pattern[] tells if it is ToAll or ToZero */
775:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, pattern, 2, MPIU_INT, MPI_LAND, xcomm));

777:     if (pattern[0] || pattern[1]) {
778:       PetscCall(PetscSFCreate(xcomm, &sf));
779:       PetscCall(PetscSFSetFromOptions(sf));
780:       PetscCall(PetscSFSetGraphWithPattern(sf, map, pattern[0] ? PETSCSF_PATTERN_ALLGATHER : PETSCSF_PATTERN_GATHER));
781:       goto functionend; /* No further analysis needed. What a big win! */
782:     }
783:   }

785:   /* Continue ...
786:      Do block optimization by taking advantage of high level info available in ix, iy.
787:      The block optimization is valid when all of the following conditions are met:
788:      1) ix, iy are blocked or can be blocked (i.e., strided with step=1);
789:      2) ix, iy have the same block size;
790:      3) all processors agree on one block size;
791:      4) no blocks span more than one process;
792:    */
793:   bigcomm = (xcommsize == 1) ? ycomm : xcomm;

795:   /* Processors could go through different path in this if-else test */
796:   m[0] = PETSC_INT_MAX;
797:   m[1] = PETSC_INT_MIN;
798:   if (ixid == IS_BLOCK && iyid == IS_BLOCK) {
799:     m[0] = PetscMin(bsx, bsy);
800:     m[1] = PetscMax(bsx, bsy);
801:   } else if (ixid == IS_BLOCK && iyid == IS_STRIDE && iystep == 1 && iyfirst % bsx == 0) {
802:     m[0] = bsx;
803:     m[1] = bsx;
804:   } else if (ixid == IS_STRIDE && iyid == IS_BLOCK && ixstep == 1 && ixfirst % bsy == 0) {
805:     m[0] = bsy;
806:     m[1] = bsy;
807:   }
808:   /* Get max and min of bsx,bsy over all processes in one allreduce */
809:   PetscCall(PetscGlobalMinMaxInt(bigcomm, m, mg));

811:   /* Since we used allreduce above, all ranks will have the same min and max. min==max
812:      implies all ranks have the same bs. Do further test to see if local vectors are dividable
813:      by bs on ALL ranks. If they are, we are ensured that no blocks span more than one processor.
814:    */
815:   if (mg[0] == mg[1] && mg[0] > 1) {
816:     PetscCall(VecGetLocalSize(x, &xlen));
817:     PetscCall(VecGetLocalSize(y, &ylen));
818:     m[0] = xlen % mg[0];
819:     m[1] = ylen % mg[0];
820:     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, m, 2, MPIU_INT, MPI_LOR, bigcomm));
821:     if (!m[0] && !m[1]) can_do_block_opt = PETSC_TRUE;
822:   }

824:   /* If can_do_block_opt, then shrink x, y, ix and iy by bs to get xx, yy, ixx and iyy, whose indices
825:      and layout are actually used in building SF. Suppose blocked ix representing {0,1,2,6,7,8} has
826:      indices {0,2} and bs=3, then ixx = {0,2}; suppose strided iy={3,4,5,6,7,8}, then iyy={1,2}.

828:      xx is a little special. If x is seq, then xx is the concatenation of seq x's on ycomm. In this way,
829:      we can treat PtoP and StoP uniformly as PtoS.
830:    */
831:   if (can_do_block_opt) {
832:     const PetscInt *indices;

834:     /* Shrink x and ix */
835:     bs = mg[0];
836:     PetscCall(VecCreateMPIWithArray(bigcomm, 1, xlen / bs, PETSC_DECIDE, NULL, &xx)); /* We only care xx's layout */
837:     if (ixid == IS_BLOCK) {
838:       PetscCall(ISBlockGetIndices(ix, &indices));
839:       PetscCall(ISBlockGetLocalSize(ix, &ixsize));
840:       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ixsize, indices, PETSC_COPY_VALUES, &ixx));
841:       PetscCall(ISBlockRestoreIndices(ix, &indices));
842:     } else { /* ixid == IS_STRIDE */
843:       PetscCall(ISGetLocalSize(ix, &ixsize));
844:       PetscCall(ISCreateStride(PETSC_COMM_SELF, ixsize / bs, ixfirst / bs, 1, &ixx));
845:     }

847:     /* Shrink y and iy */
848:     PetscCall(VecCreateMPIWithArray(ycomm, 1, ylen / bs, PETSC_DECIDE, NULL, &yy));
849:     if (iyid == IS_BLOCK) {
850:       PetscCall(ISBlockGetIndices(iy, &indices));
851:       PetscCall(ISBlockGetLocalSize(iy, &iysize));
852:       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, iysize, indices, PETSC_COPY_VALUES, &iyy));
853:       PetscCall(ISBlockRestoreIndices(iy, &indices));
854:     } else { /* iyid == IS_STRIDE */
855:       PetscCall(ISGetLocalSize(iy, &iysize));
856:       PetscCall(ISCreateStride(PETSC_COMM_SELF, iysize / bs, iyfirst / bs, 1, &iyy));
857:     }
858:   } else {
859:     ixx = ix;
860:     iyy = iy;
861:     yy  = y;
862:     if (xcommsize == 1) PetscCall(VecCreateMPIWithArray(bigcomm, 1, xlen, PETSC_DECIDE, NULL, &xx));
863:     else xx = x;
864:   }

866:   /* Now it is ready to build SF with preprocessed (xx, yy) and (ixx, iyy) */
867:   PetscCall(ISGetIndices(ixx, &xindices));
868:   PetscCall(ISGetIndices(iyy, &yindices));
869:   PetscCall(VecGetLayout(xx, &xlayout));

871:   if (ycommsize > 1) {
872:     /* PtoP or StoP */

874:     /* Below is a piece of complex code with a very simple goal: move global index pairs (xindices[i], yindices[i]),
875:        to owner process of yindices[i] according to ylayout, i = 0..n.

877:        I did it through a temp sf, but later I thought the old design was inefficient and also distorted log view.
878:        We want to map one VecScatterCreate() call to one PetscSFCreate() call. The old design mapped to three
879:        PetscSFCreate() calls. This code is on critical path of VecScatterSetUp and is used by every VecScatterCreate.
880:        So I commented it out and did another optimized implementation. The commented code is left here for reference.
881:      */
882: #if 0
883:     const PetscInt *degree;
884:     PetscSF        tmpsf;
885:     PetscInt       inedges=0,*leafdata,*rootdata;

887:     PetscCall(VecGetOwnershipRange(xx,&xstart,NULL));
888:     PetscCall(VecGetLayout(yy,&ylayout));
889:     PetscCall(VecGetOwnershipRange(yy,&ystart,NULL));

891:     PetscCall(VecGetLocalSize(yy,&nroots));
892:     PetscCall(ISGetLocalSize(iyy,&nleaves));
893:     PetscCall(PetscMalloc2(nleaves,&iremote,nleaves*2,&leafdata));

895:     for (i=0; i<nleaves; i++) {
896:       PetscCall(PetscLayoutFindOwnerIndex(ylayout,yindices[i],&iremote[i].rank,&iremote[i].index));
897:       leafdata[2*i]   = yindices[i];
898:       leafdata[2*i+1] = (xcommsize > 1)? xindices[i] : xindices[i] + xstart;
899:     }

901:     PetscCall(PetscSFCreate(ycomm,&tmpsf));
902:     PetscCall(PetscSFSetGraph(tmpsf,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER));

904:     PetscCall(PetscSFComputeDegreeBegin(tmpsf,&degree));
905:     PetscCall(PetscSFComputeDegreeEnd(tmpsf,&degree));

907:     for (i=0; i<nroots; i++) inedges += degree[i];
908:     PetscCall(PetscMalloc1(inedges*2,&rootdata));
909:     PetscCall(PetscSFGatherBegin(tmpsf,MPIU_2INT,leafdata,rootdata));
910:     PetscCall(PetscSFGatherEnd(tmpsf,MPIU_2INT,leafdata,rootdata));

912:     PetscCall(PetscFree2(iremote,leafdata));
913:     PetscCall(PetscSFDestroy(&tmpsf));

915:     /* rootdata contains global index pairs (i, j). j's are owned by the current process, but i's can point to anywhere.
916:        We convert j to local, and convert i to (rank, index). In the end, we get an PtoS suitable for building SF.
917:      */
918:     nleaves = inedges;
919:     PetscCall(VecGetLocalSize(xx,&nroots));
920:     PetscCall(PetscMalloc1(nleaves,&ilocal));
921:     PetscCall(PetscMalloc1(nleaves,&iremote));

923:     for (i=0; i<inedges; i++) {
924:       ilocal[i] = rootdata[2*i] - ystart; /* convert y's global index to local index */
925:       PetscCall(PetscLayoutFindOwnerIndex(xlayout,rootdata[2*i+1],&iremote[i].rank,&iremote[i].index)); /* convert x's global index to (rank, index) */
926:     }
927:     PetscCall(PetscFree(rootdata));
928: #else
929:     PetscInt        j, k, n, disp, rlentotal, *sstart, *xindices_sorted, *yindices_sorted;
930:     const PetscInt *yrange;
931:     PetscMPIInt     nsend, nrecv, nreq, yrank, *sendto, *recvfrom, tag1, tag2;
932:     PetscInt       *slens, *rlens, count;
933:     PetscInt       *rxindices, *ryindices;
934:     MPI_Request    *reqs, *sreqs, *rreqs;

936:     /* Sorting makes code simpler, faster and also helps getting rid of many O(P) arrays, which hurt scalability at large scale
937:        yindices_sorted - sorted yindices
938:        xindices_sorted - xindices sorted along with yindces
939:      */
940:     PetscCall(ISGetLocalSize(ixx, &n)); /*ixx, iyy have the same local size */
941:     PetscCall(PetscMalloc2(n, &xindices_sorted, n, &yindices_sorted));
942:     PetscCall(PetscArraycpy(xindices_sorted, xindices, n));
943:     PetscCall(PetscArraycpy(yindices_sorted, yindices, n));
944:     PetscCall(PetscSortIntWithArray(n, yindices_sorted, xindices_sorted));
945:     PetscCall(VecGetOwnershipRange(xx, &xstart, NULL));
946:     if (xcommsize == 1) {
947:       for (i = 0; i < n; i++) xindices_sorted[i] += xstart;
948:     } /* Convert to global indices */

950:     /*
951:              Calculate info about messages I need to send
952:        nsend    - number of non-empty messages to send
953:        sendto   - [nsend] ranks I will send messages to
954:        sstart   - [nsend+1] sstart[i] is the start index in xsindices_sorted[] I send to rank sendto[i]
955:        slens    - [ycommsize] I want to send slens[i] entries to rank i.
956:      */
957:     PetscCall(VecGetLayout(yy, &ylayout));
958:     PetscCall(PetscLayoutGetRanges(ylayout, &yrange));
959:     PetscCall(PetscCalloc1(ycommsize, &slens)); /* The only O(P) array in this algorithm */

961:     i = j = nsend = 0;
962:     while (i < n) {
963:       if (yindices_sorted[i] >= yrange[j + 1]) { /* If i-th index is out of rank j's bound */
964:         do {
965:           j++;
966:         } while (yindices_sorted[i] >= yrange[j + 1] && j < ycommsize); /* Increase j until i-th index falls in rank j's bound */
967:         PetscCheck(j != ycommsize, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " not owned by any process, upper bound %" PetscInt_FMT, yindices_sorted[i], yrange[ycommsize]);
968:       }
969:       i++;
970:       if (!slens[j]++) nsend++;
971:     }

973:     PetscCall(PetscMalloc2(nsend + 1, &sstart, nsend, &sendto));

975:     sstart[0] = 0;
976:     for (i = j = 0; i < ycommsize; i++) {
977:       if (slens[i]) {
978:         sendto[j]     = (PetscMPIInt)i;
979:         sstart[j + 1] = sstart[j] + slens[i];
980:         j++;
981:       }
982:     }

984:     /*
985:       Calculate the reverse info about messages I will recv
986:        nrecv     - number of messages I will recv
987:        recvfrom  - [nrecv] ranks I recv from
988:        rlens     - [nrecv] I will recv rlens[i] entries from rank recvfrom[i]
989:        rlentotal - sum of rlens[]
990:        rxindices - [rlentotal] recv buffer for xindices_sorted
991:        ryindices - [rlentotal] recv buffer for yindices_sorted
992:      */
993:     PetscCall(PetscGatherNumberOfMessages_Private(ycomm, NULL, slens, &nrecv));
994:     PetscCall(PetscGatherMessageLengths_Private(ycomm, nsend, nrecv, slens, &recvfrom, &rlens));
995:     PetscCall(PetscFree(slens)); /* Free the O(P) array ASAP */
996:     rlentotal = 0;
997:     for (i = 0; i < nrecv; i++) rlentotal += rlens[i];

999:     /*
1000:       Communicate with processors in recvfrom[] to populate rxindices and ryindices
1001:     */
1002:     PetscCall(PetscCommGetNewTag(ycomm, &tag1));
1003:     PetscCall(PetscCommGetNewTag(ycomm, &tag2));
1004:     PetscCall(PetscMalloc2(rlentotal, &rxindices, rlentotal, &ryindices));
1005:     PetscCall(PetscMPIIntCast((nsend + nrecv) * 2, &nreq));
1006:     PetscCall(PetscMalloc1(nreq, &reqs));
1007:     sreqs = reqs;
1008:     rreqs = PetscSafePointerPlusOffset(reqs, nsend * 2);

1010:     for (i = disp = 0; i < nrecv; i++) {
1011:       count = rlens[i];
1012:       PetscCallMPI(MPIU_Irecv(rxindices + disp, count, MPIU_INT, recvfrom[i], tag1, ycomm, rreqs + i));
1013:       PetscCallMPI(MPIU_Irecv(ryindices + disp, count, MPIU_INT, recvfrom[i], tag2, ycomm, rreqs + nrecv + i));
1014:       disp += rlens[i];
1015:     }

1017:     for (i = 0; i < nsend; i++) {
1018:       count = sstart[i + 1] - sstart[i];
1019:       PetscCallMPI(MPIU_Isend(xindices_sorted + sstart[i], count, MPIU_INT, sendto[i], tag1, ycomm, sreqs + i));
1020:       PetscCallMPI(MPIU_Isend(yindices_sorted + sstart[i], count, MPIU_INT, sendto[i], tag2, ycomm, sreqs + nsend + i));
1021:     }
1022:     PetscCallMPI(MPI_Waitall(nreq, reqs, MPI_STATUS_IGNORE));

1024:     /* Transform VecScatter into SF */
1025:     nleaves = rlentotal;
1026:     PetscCall(PetscMalloc1(nleaves, &ilocal));
1027:     PetscCall(PetscMalloc1(nleaves, &iremote));
1028:     PetscCallMPI(MPI_Comm_rank(ycomm, &yrank));
1029:     for (i = disp = 0; i < nrecv; i++) {
1030:       for (j = 0; j < rlens[i]; j++) {
1031:         k         = disp + j;                                                                  /* k-th index pair */
1032:         ilocal[k] = ryindices[k] - yrange[yrank];                                              /* Convert y's global index to local index */
1033:         PetscCall(PetscLayoutFindOwnerIndex(xlayout, rxindices[k], &rank, &iremote[k].index)); /* Convert x's global index to (rank, index) */
1034:         iremote[k].rank = rank;
1035:       }
1036:       disp += rlens[i];
1037:     }

1039:     PetscCall(PetscFree2(sstart, sendto));
1040:     PetscCall(PetscFree(rlens));
1041:     PetscCall(PetscFree(recvfrom));
1042:     PetscCall(PetscFree(reqs));
1043:     PetscCall(PetscFree2(rxindices, ryindices));
1044:     PetscCall(PetscFree2(xindices_sorted, yindices_sorted));
1045: #endif
1046:   } else {
1047:     /* PtoS or StoS */
1048:     PetscCall(ISGetLocalSize(iyy, &nleaves));
1049:     PetscCall(PetscMalloc1(nleaves, &ilocal));
1050:     PetscCall(PetscMalloc1(nleaves, &iremote));
1051:     PetscCall(PetscArraycpy(ilocal, yindices, nleaves));
1052:     for (i = 0; i < nleaves; i++) {
1053:       PetscCall(PetscLayoutFindOwnerIndex(xlayout, xindices[i], &rank, &iremote[i].index));
1054:       iremote[i].rank = rank;
1055:     }
1056:   }

1058:   /* MUST build SF on xx's comm, which is not necessarily identical to yy's comm.
1059:      In SF's view, xx contains the roots (i.e., the remote) and iremote[].rank are ranks in xx's comm.
1060:      yy contains leaves, which are local and can be thought as part of PETSC_COMM_SELF. */
1061:   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)xx), &sf));
1062:   sf->allow_multi_leaves = PETSC_TRUE;
1063:   PetscCall(PetscSFSetFromOptions(sf));
1064:   PetscCall(VecGetLocalSize(xx, &nroots));
1065:   PetscCall(PetscSFSetGraph(sf, nroots, nleaves, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); /* Give ilocal/iremote to petsc and no need to free them here */

1067:   /* Free memory no longer needed */
1068:   PetscCall(ISRestoreIndices(ixx, &xindices));
1069:   PetscCall(ISRestoreIndices(iyy, &yindices));
1070:   if (can_do_block_opt) {
1071:     PetscCall(VecDestroy(&xx));
1072:     PetscCall(VecDestroy(&yy));
1073:     PetscCall(ISDestroy(&ixx));
1074:     PetscCall(ISDestroy(&iyy));
1075:   } else if (xcommsize == 1) {
1076:     PetscCall(VecDestroy(&xx));
1077:   }

1079: functionend:
1080:   sf->vscat.bs = bs;
1081:   if (sf->vscat.bs > 1) {
1082:     PetscCallMPI(MPI_Type_contiguous((PetscMPIInt)sf->vscat.bs, MPIU_SCALAR, &sf->vscat.unit));
1083:     PetscCallMPI(MPI_Type_commit(&sf->vscat.unit));
1084:   } else {
1085:     sf->vscat.unit = MPIU_SCALAR;
1086:   }
1087:   PetscCall(VecGetLocalSize(x, &sf->vscat.from_n));
1088:   PetscCall(VecGetLocalSize(y, &sf->vscat.to_n));
1089:   if (!ix_old) PetscCall(ISDestroy(&ix)); /* We created helper ix, iy. Free them */
1090:   if (!iy_old) PetscCall(ISDestroy(&iy));

1092:   /* Set default */
1093:   PetscCall(VecScatterSetFromOptions(sf));
1094:   PetscCall(PetscSFSetUp(sf));

1096:   *newsf = sf;
1097:   PetscFunctionReturn(PETSC_SUCCESS);
1098: }

1100: /*@
1101:   VecScatterCreateToAll - Creates a vector and a scatter context that copies all
1102:   vector values to each processor

1104:   Collective

1106:   Input Parameter:
1107: . vin - an `MPIVEC`

1109:   Output Parameters:
1110: + ctx  - scatter context
1111: - vout - output `SEQVEC` that is large enough to scatter into

1113:   Level: intermediate

1115:   Example Usage:
1116: .vb
1117:   VecScatterCreateToAll(vin, &ctx, &vout);

1119:   // scatter as many times as you need
1120:   VecScatterBegin(ctx, vin, vout, INSERT_VALUES, SCATTER_FORWARD);
1121:   VecScatterEnd(ctx, vin, vout, INSERT_VALUES, SCATTER_FORWARD);

1123:   // destroy scatter context and local vector when no longer needed
1124:   VecScatterDestroy(&ctx);
1125:   VecDestroy(&vout);
1126: .ve

1128:   Notes:
1129:   `vout` may be `NULL` [`PETSC_NULL_VEC` from Fortran] if you do not
1130:   need to have it created

1132:   Do NOT create a vector and then pass it in as the final argument `vout`! `vout` is created by this routine
1133:   automatically (unless you pass `NULL` in for that argument if you do not need it).

1135: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterCreateToZero()`, `VecScatterBegin()`, `VecScatterEnd()`
1136: @*/
1137: PetscErrorCode VecScatterCreateToAll(Vec vin, VecScatter *ctx, Vec *vout)
1138: {
1139:   PetscInt  N;
1140:   IS        is;
1141:   Vec       tmp;
1142:   Vec      *tmpv;
1143:   PetscBool tmpvout = PETSC_FALSE;
1144:   VecType   roottype;

1146:   PetscFunctionBegin;
1149:   PetscAssertPointer(ctx, 2);
1150:   if (vout) {
1151:     PetscAssertPointer(vout, 3);
1152:     tmpv = vout;
1153:   } else {
1154:     tmpvout = PETSC_TRUE;
1155:     tmpv    = &tmp;
1156:   }

1158:   /* Create seq vec on each proc, with the same size of the original vec */
1159:   PetscCall(VecGetSize(vin, &N));
1160:   PetscCall(VecGetRootType_Private(vin, &roottype));
1161:   PetscCall(VecCreate(PETSC_COMM_SELF, tmpv));
1162:   PetscCall(VecSetSizes(*tmpv, N, PETSC_DECIDE));
1163:   PetscCall(VecSetType(*tmpv, roottype));
1164:   /* Create the VecScatter ctx with the communication info */
1165:   PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &is));
1166:   PetscCall(VecScatterCreate(vin, is, *tmpv, is, ctx));
1167:   PetscCall(ISDestroy(&is));
1168:   if (tmpvout) PetscCall(VecDestroy(tmpv));
1169:   PetscFunctionReturn(PETSC_SUCCESS);
1170: }

1172: /*@
1173:   VecScatterCreateToZero - Creates an output vector and a scatter context used to
1174:   copy all vector values into the output vector on the zeroth processor

1176:   Collective

1178:   Input Parameter:
1179: . vin - `Vec` of type `MPIVEC`

1181:   Output Parameters:
1182: + ctx  - scatter context
1183: - vout - output `SEQVEC` that is large enough to scatter into on processor 0 and
1184:           of length zero on all other processors

1186:   Level: intermediate

1188:   Example Usage:
1189: .vb
1190:   VecScatterCreateToZero(vin, &ctx, &vout);

1192:   // scatter as many times as you need
1193:   VecScatterBegin(ctx, vin, vout, INSERT_VALUES, SCATTER_FORWARD);
1194:   VecScatterEnd(ctx, vin, vout, INSERT_VALUES, SCATTER_FORWARD);

1196:   // destroy scatter context and local vector when no longer needed
1197:   VecScatterDestroy(&ctx);
1198:   VecDestroy(&vout);
1199: .ve

1201:   Notes:
1202:   vout may be `NULL` [`PETSC_NULL_VEC` from Fortran] if you do not
1203:   need to have it created

1205:   Do NOT create a vector and then pass it in as the final argument `vout`! `vout` is created by this routine
1206:   automatically (unless you pass `NULL` in for that argument if you do not need it).

1208: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterCreateToAll()`, `VecScatterBegin()`, `VecScatterEnd()`
1209: @*/
1210: PetscErrorCode VecScatterCreateToZero(Vec vin, VecScatter *ctx, Vec *vout)
1211: {
1212:   PetscInt    N;
1213:   PetscMPIInt rank;
1214:   IS          is;
1215:   Vec         tmp;
1216:   Vec        *tmpv;
1217:   PetscBool   tmpvout = PETSC_FALSE;
1218:   VecType     roottype;

1220:   PetscFunctionBegin;
1223:   PetscAssertPointer(ctx, 2);
1224:   if (vout) {
1225:     PetscAssertPointer(vout, 3);
1226:     tmpv = vout;
1227:   } else {
1228:     tmpvout = PETSC_TRUE;
1229:     tmpv    = &tmp;
1230:   }

1232:   /* Create vec on each proc, with the same size of the original vec all on process 0 */
1233:   PetscCall(VecGetSize(vin, &N));
1234:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)vin), &rank));
1235:   if (rank) N = 0;
1236:   PetscCall(VecGetRootType_Private(vin, &roottype));
1237:   PetscCall(VecCreate(PETSC_COMM_SELF, tmpv));
1238:   PetscCall(VecSetSizes(*tmpv, N, PETSC_DECIDE));
1239:   PetscCall(VecSetType(*tmpv, roottype));
1240:   /* Create the VecScatter ctx with the communication info */
1241:   PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &is));
1242:   PetscCall(VecScatterCreate(vin, is, *tmpv, is, ctx));
1243:   PetscCall(ISDestroy(&is));
1244:   if (tmpvout) PetscCall(VecDestroy(tmpv));
1245:   PetscFunctionReturn(PETSC_SUCCESS);
1246: }

1248: /*@
1249:   VecScatterBegin - Begins a generalized scatter from one vector to
1250:   another. Complete the scattering phase with `VecScatterEnd()`.

1252:   Neighbor-wise Collective

1254:   Input Parameters:
1255: + sf   - scatter context generated by `VecScatterCreate()`
1256: . x    - the vector from which we scatter
1257: . y    - the vector to which we scatter
1258: . addv - either `ADD_VALUES`, `MAX_VALUES`, `MIN_VALUES` or `INSERT_VALUES`, with `INSERT_VALUES` mode any location
1259:           not scattered to retains its old value; i.e. the vector is NOT first zeroed.
1260: - mode - the scattering mode, usually `SCATTER_FORWARD`.  The available modes are: `SCATTER_FORWARD` or `SCATTER_REVERSE`

1262:   Level: intermediate

1264:   Notes:
1265:   The vectors `x` and `y` need not be the same vectors used in the call
1266:   to `VecScatterCreate()`, but `x` must have the same parallel data layout
1267:   as that passed in as the `x` to `VecScatterCreate()`, similarly for the `y`.
1268:   Most likely they have been obtained from `VecDuplicate()`.

1270:   You cannot change the values in the input vector between the calls to `VecScatterBegin()`
1271:   and `VecScatterEnd()`.

1273:   If you use `SCATTER_REVERSE` the two arguments `x` and `y` should be reversed, from
1274:   the `SCATTER_FORWARD`.

1276:   y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1278:   This scatter is far more general than the conventional
1279:   scatter, since it can be a gather or a scatter or a combination,
1280:   depending on the indices ix and iy.  If x is a parallel vector and y
1281:   is sequential, `VecScatterBegin()` can serve to gather values to a
1282:   single processor.  Similarly, if `y` is parallel and `x` sequential, the
1283:   routine can scatter from one processor to many processors.

1285: .seealso: [](sec_scatter), `VecScatter`, `VecScatterCreate()`, `VecScatterEnd()`
1286: @*/
1287: PetscErrorCode VecScatterBegin(VecScatter sf, Vec x, Vec y, InsertMode addv, ScatterMode mode)
1288: {
1289:   PetscInt to_n, from_n;

1291:   PetscFunctionBegin;
1295:   if (PetscDefined(USE_DEBUG)) {
1296:     /*
1297:      Error checking to make sure these vectors match the vectors used
1298:      to create the vector scatter context. -1 in the from_n and to_n indicate the
1299:      vector lengths are unknown (for example with mapped scatters) and thus
1300:      no error checking is performed.
1301:      */
1302:     if (sf->vscat.from_n >= 0 && sf->vscat.to_n >= 0) {
1303:       PetscCall(VecGetLocalSize(x, &from_n));
1304:       PetscCall(VecGetLocalSize(y, &to_n));
1305:       if (mode & SCATTER_REVERSE) {
1306:         PetscCheck(to_n == sf->vscat.from_n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Vector wrong size %" PetscInt_FMT " for scatter %" PetscInt_FMT " (scatter reverse and vector to != sf from size)", to_n, sf->vscat.from_n);
1307:         PetscCheck(from_n == sf->vscat.to_n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Vector wrong size %" PetscInt_FMT " for scatter %" PetscInt_FMT " (scatter reverse and vector from != sf to size)", from_n, sf->vscat.to_n);
1308:       } else {
1309:         PetscCheck(to_n == sf->vscat.to_n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Vector wrong size %" PetscInt_FMT " for scatter %" PetscInt_FMT " (scatter forward and vector to != sf to size)", to_n, sf->vscat.to_n);
1310:         PetscCheck(from_n == sf->vscat.from_n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Vector wrong size %" PetscInt_FMT " for scatter %" PetscInt_FMT " (scatter forward and vector from != sf from size)", from_n, sf->vscat.from_n);
1311:       }
1312:     }
1313:   }

1315:   sf->vscat.logging = PETSC_TRUE;
1316:   PetscCall(PetscLogEventBegin(VEC_ScatterBegin, sf, x, y, 0));
1317:   PetscCall(VecScatterBegin_Internal(sf, x, y, addv, mode));
1318:   if (sf->vscat.beginandendtogether) PetscCall(VecScatterEnd_Internal(sf, x, y, addv, mode));
1319:   PetscCall(PetscLogEventEnd(VEC_ScatterBegin, sf, x, y, 0));
1320:   sf->vscat.logging = PETSC_FALSE;
1321:   PetscFunctionReturn(PETSC_SUCCESS);
1322: }

1324: /*@
1325:   VecScatterEnd - Ends a generalized scatter from one vector to another. Call
1326:   after first calling `VecScatterBegin()`.

1328:   Neighbor-wise Collective

1330:   Input Parameters:
1331: + sf   - scatter context generated by `VecScatterCreate()`
1332: . x    - the vector from which we scatter
1333: . y    - the vector to which we scatter
1334: . addv - one of `ADD_VALUES`, `MAX_VALUES`, `MIN_VALUES` or `INSERT_VALUES`
1335: - mode - the scattering mode, usually `SCATTER_FORWARD`.  The available modes are: `SCATTER_FORWARD`, `SCATTER_REVERSE`

1337:   Level: intermediate

1339:   Notes:
1340:   If you use `SCATTER_REVERSE` the arguments `x` and `y` should be reversed, from the `SCATTER_FORWARD`.

1342:   y[iy[i]] = x[ix[i]], for i=0,...,ni-1

1344: .seealso: [](sec_scatter), `VecScatter`, `VecScatterBegin()`, `VecScatterCreate()`
1345: @*/
1346: PetscErrorCode VecScatterEnd(VecScatter sf, Vec x, Vec y, InsertMode addv, ScatterMode mode)
1347: {
1348:   PetscFunctionBegin;
1352:   if (!sf->vscat.beginandendtogether) {
1353:     sf->vscat.logging = PETSC_TRUE;
1354:     PetscCall(PetscLogEventBegin(VEC_ScatterEnd, sf, x, y, 0));
1355:     PetscCall(VecScatterEnd_Internal(sf, x, y, addv, mode));
1356:     PetscCall(PetscLogEventEnd(VEC_ScatterEnd, sf, x, y, 0));
1357:     sf->vscat.logging = PETSC_FALSE;
1358:   }
1359:   PetscFunctionReturn(PETSC_SUCCESS);
1360: }