Actual source code: vecseqcupm.cu
  1: #include "../vecseqcupm.hpp" /*I <petscvec.h> I*/
  2: #include "../vecseqcupm_impl.hpp"

  4: using namespace ::Petsc::vec::cupm;
  5: using ::Petsc::device::cupm::DeviceType;

  7: template class impl::VecSeq_CUPM<DeviceType::CUDA>; // explicit instantiation of VecSeq_CUPM for DeviceType::CUDA

  9: static constexpr auto VecSeq_CUDA = impl::VecSeq_CUPM<DeviceType::CUDA>{}; // file-local instance; VecCreate_SeqCUDA() and VecConvert_Seq_SeqCUDA_inplace() call its members

 11: /*MC
 12:   VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

 14:   Options Database Key:
 15: . -vec_type seqcuda - sets the vector type to `VECSEQCUDA` during a call to `VecSetFromOptions()`

 17:   Level: beginner

 19: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECSEQ`,
 20: `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`, `VECCUDA`, `VECHIP`, `VECMPICUDA`, `VECMPIHIP`, `VECSEQHIP`
 21: M*/

 23: PetscErrorCode VecCreate_SeqCUDA(Vec v) // thin wrapper: hands `v` to VecSeq_CUDA.Create()
 24: {
 25:   PetscFunctionBegin;
 26:   PetscCall(VecSeq_CUDA.Create(v)); // VecSeq_CUDA is the file-local impl::VecSeq_CUPM<DeviceType::CUDA> instance
 27:   PetscFunctionReturn(PETSC_SUCCESS);
 28: }

 30: PetscErrorCode VecConvert_Seq_SeqCUDA_inplace(Vec v) // thin wrapper: hands `v` to VecSeq_CUDA.Convert_IMPL_IMPLCUPM()
 31: {
 32:   PetscFunctionBegin;
 33:   PetscCall(VecSeq_CUDA.Convert_IMPL_IMPLCUPM(v)); // NOTE(review): name suggests an in-place VECSEQ -> VECSEQCUDA conversion; confirm in the VecSeq_CUPM implementation
 34:   PetscFunctionReturn(PETSC_SUCCESS);
 35: }

 37: // PetscClangLinter pragma disable: -fdoc-internal-linkage
 38: /*@
 39:   VecCreateSeqCUDA - Creates a standard, sequential, array-style vector using CUDA.

 41:   Collective, Possibly Synchronous

 43:   Input Parameters:
 44: + comm - the communicator, must be `PETSC_COMM_SELF`
 45: - n    - the vector length

 47:   Output Parameter:
 48: . v - the vector

 50:   Level: intermediate

 52:   Notes:
 53:   Use `VecDuplicate()` or `VecDuplicateVecs()` to form additional vectors of the same type as an
 54:   existing vector.

 56:   This function may initialize `PetscDevice`, which may incur a device synchronization.

 58: .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeq()`, `VecCreateSeqCUDAWithArray()`,
 59:           `VecCreateMPI()`, `VecCreateMPICUDA()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
 60: @*/
 61: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm, PetscInt n, Vec *v)
 62: {
 63:   PetscFunctionBegin;
 64:   PetscCall(VecCreateSeqCUPMAsync<DeviceType::CUDA>(comm, n, v)); // calls VecCreateSeqCUPMAsync with DeviceType::CUDA as its template argument; arguments are forwarded unchanged
 65:   PetscFunctionReturn(PETSC_SUCCESS);
 66: }

 68: // PetscClangLinter pragma disable: -fdoc-internal-linkage
 69: /*@C
 70:   VecCreateSeqCUDAWithArrays - Creates a sequential, array-style vector using CUDA, where the
 71:   user provides the complete array space to store the vector values.

 73:   Collective, Possibly Synchronous

 75:   Input Parameters:
 76: + comm     - the communicator, must be `PETSC_COMM_SELF`
 77: . bs       - the block size
 78: . n        - the local vector length
 79: . cpuarray - CPU memory where the vector elements are to be stored (or `NULL`)
 80: - gpuarray - GPU memory where the vector elements are to be stored (or `NULL`)

 82:   Output Parameter:
 83: . v - the vector

 85:   Level: intermediate

 87:   Notes:
 88:   If the user-provided array is `NULL`, then `VecCUDAPlaceArray()` can be used at a later stage to
 89:   SET the array for storing the vector values. Otherwise, the array must be allocated on the
 90:   device.

 92:   If both `cpuarray` and `gpuarray` are provided, the provided arrays must have identical
 93:   values.

 95:   The arrays are NOT freed when the vector is destroyed via `VecDestroy()`. The user must free
 96:   them themselves, but not until the vector is destroyed.

 98:   This function may initialize `PetscDevice`, which may incur a device synchronization.

100: .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeqWithArray()`, `VecCreateSeqCUDA()`,
101:           `VecCreateSeqCUDAWithArray()`, `VecCreateMPICUDA()`, `VecCreateMPICUDAWithArray()`,
102:           `VecCreateMPICUDAWithArrays()`, `VecCUDAPlaceArray()`
103: @*/
104: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *v)
105: {
106:   PetscFunctionBegin;
107:   PetscCall(VecCreateSeqCUPMWithArraysAsync<DeviceType::CUDA>(comm, bs, n, cpuarray, gpuarray, v)); // both the host and the device array are forwarded as given (either may be NULL per the notes above)
108:   PetscFunctionReturn(PETSC_SUCCESS);
109: }

111: // PetscClangLinter pragma disable: -fdoc-internal-linkage
112: /*@C
113:   VecCreateSeqCUDAWithArray - Creates a sequential, array-style vector using CUDA, where the
114:   user provides the device array space to store the vector values.

116:   Collective, Possibly Synchronous

118:   Input Parameters:
119: + comm     - the communicator, must be `PETSC_COMM_SELF`
120: . bs       - the block size
121: . n        - the vector length
122: - gpuarray - GPU memory where the vector elements are to be stored (or `NULL`)

124:   Output Parameter:
125: . v - the vector

127:   Level: intermediate

129:   Notes:
130:   If the user-provided array is `NULL`, then `VecCUDAPlaceArray()` can be used at a later stage to
131:   SET the array for storing the vector values. Otherwise, the array must be allocated on the
132:   device.

134:   The array is NOT freed when the vector is destroyed via `VecDestroy()`. The user must free the
135:   array themselves, but not until the vector is destroyed.

137:   Use `VecDuplicate()` or `VecDuplicateVecs()` to form additional vectors of the same type as an
138:   existing vector.

140:   This function may initialize `PetscDevice`, which may incur a device synchronization.

142: .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeq()`, `VecCreateSeqWithArray()`,
143:           `VecCreateMPIWithArray()`, `VecCreateSeqCUDA()`, `VecCreateMPICUDAWithArray()`, `VecCUDAPlaceArray()`,
144:           `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
145: @*/
146: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar gpuarray[], Vec *v)
147: {
148:   PetscFunctionBegin;
149:   PetscCall(VecCreateSeqCUDAWithArrays(comm, bs, n, nullptr, gpuarray, v)); // reuses VecCreateSeqCUDAWithArrays(), passing nullptr for its cpuarray argument
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: // PetscClangLinter pragma disable: -fdoc-internal-linkage
154: /*@C
155:   VecCUDAGetArray - Provides access to the device buffer inside a vector

157:   Logically Collective; Asynchronous; No Fortran Support

159:   Input Parameter:
160: . v - the vector

162:   Output Parameter:
163: . a - the device buffer

165:   Level: intermediate

167:   Notes:
168:   This routine has semantics similar to `VecGetArray()`; the returned buffer points to a
169:   consistent view of the vector data. This may involve copying data from the host to the device
170:   if the data on the device is out of date. It is also assumed that the returned buffer is
171:   immediately modified, marking the host data out of date. This is similar to intent(inout) in
172:   Fortran.

174:   If the user does require strong memory guarantees, they are encouraged to use
175:   `VecCUDAGetArrayRead()` and/or `VecCUDAGetArrayWrite()` instead.

177:   The user must call `VecCUDARestoreArray()` when they are finished using the array.

179:   Developer Note:
180:   If the device memory hasn't been allocated previously it will be allocated as part of this
181:   routine.

183: .seealso: [](ch_vectors), `VecCUDARestoreArray()`, `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
184:           `VecGetArrayRead()`, `VecGetArrayWrite()`
185: @*/
186: PetscErrorCode VecCUDAGetArray(Vec v, PetscScalar **a)
187: {
188:   PetscFunctionBegin;
189:   PetscCall(VecCUPMGetArrayAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMGetArrayAsync with DeviceType::CUDA as its template argument; `v` and `a` are forwarded unchanged
190:   PetscFunctionReturn(PETSC_SUCCESS);
191: }

193: // PetscClangLinter pragma disable: -fdoc-internal-linkage
194: /*@C
195:   VecCUDARestoreArray - Restore a device buffer previously acquired with `VecCUDAGetArray()`.

197:   Not Collective; Asynchronous; No Fortran Support

199:   Input Parameters:
200: + v - the vector
201: - a - the device buffer

203:   Level: intermediate

205:   Note:
206:   The restored pointer is invalid after this function returns. This function also marks the
207:   host data as out of date. Subsequent access to the vector data on the host side via
208:   `VecGetArray()` will incur a (synchronous) data transfer.

210: .seealso: [](ch_vectors), `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
211:           `VecRestoreArray()`, `VecGetArrayRead()`
212: @*/
213: PetscErrorCode VecCUDARestoreArray(Vec v, PetscScalar **a)
214: {
215:   PetscFunctionBegin;
216:   PetscCall(VecCUPMRestoreArrayAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMRestoreArrayAsync with DeviceType::CUDA as its template argument
217:   PetscFunctionReturn(PETSC_SUCCESS);
218: }

220: // PetscClangLinter pragma disable: -fdoc-internal-linkage
221: /*@C
222:   VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

224:   Not Collective; Asynchronous; No Fortran Support

226:   Input Parameter:
227: . v - the vector

229:   Output Parameter:
230: . a - the CUDA pointer.

232:   Level: intermediate

234:   Notes:
235:   See `VecCUDAGetArray()` for data movement semantics of this function.

237:   This function assumes that the user will not modify the vector data. This is analogous to
238:   intent(in) in Fortran.

240:   The device pointer must be restored by calling `VecCUDARestoreArrayRead()`. If the data on the
241:   host side was previously up to date it will remain so, i.e. data on both the device and the
242:   host is up to date. Accessing data on the host side does not incur a device to host data
243:   transfer.

245: .seealso: [](ch_vectors), `VecCUDARestoreArrayRead()`, `VecCUDAGetArray()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
246:           `VecGetArrayRead()`
247: @*/
248: PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
249: {
250:   PetscFunctionBegin;
251:   PetscCall(VecCUPMGetArrayReadAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMGetArrayReadAsync with DeviceType::CUDA as its template argument; `a` is a pointer-to-const output
252:   PetscFunctionReturn(PETSC_SUCCESS);
253: }

255: // PetscClangLinter pragma disable: -fdoc-internal-linkage
256: /*@C
257:   VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with
258:   `VecCUDAGetArrayRead()`.

260:   Not Collective; Asynchronous; No Fortran Support

262:   Input Parameters:
263: + v - the vector
264: - a - the CUDA device pointer

266:   Level: intermediate

268:   Note:
269:   This routine does not modify the corresponding array on the host in any way. The pointer is
270:   invalid after this function returns.

272: .seealso: [](ch_vectors), `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecCUDAGetArray()`, `VecGetArray()`,
273:           `VecRestoreArray()`, `VecGetArrayRead()`
274: @*/
275: PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
276: {
277:   PetscFunctionBegin;
278:   PetscCall(VecCUPMRestoreArrayReadAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMRestoreArrayReadAsync with DeviceType::CUDA as its template argument
279:   PetscFunctionReturn(PETSC_SUCCESS);
280: }

282: // PetscClangLinter pragma disable: -fdoc-internal-linkage
283: /*@C
284:   VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

286:   Logically Collective; Asynchronous; No Fortran Support

288:   Input Parameter:
289: . v - the vector

291:   Output Parameter:
292: . a - the CUDA pointer

294:   Level: advanced

296:   Notes:
297:   The data pointed to by the device pointer is uninitialized. The user may not read from this
298:   data. Furthermore, the entire array needs to be filled by the user to obtain well-defined
299:   behaviour. The device memory will be allocated by this function if it hasn't been allocated
300:   previously. This is analogous to intent(out) in Fortran.

302:   The device pointer needs to be released with `VecCUDARestoreArrayWrite()`. When the pointer is
303:   released the host data of the vector is marked as out of date. Subsequent access of the host
304:   data with e.g. `VecGetArray()` incurs a device to host data transfer.

306: .seealso: [](ch_vectors), `VecCUDARestoreArrayWrite()`, `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`,
307:           `VecGetArrayWrite()`, `VecGetArray()`, `VecGetArrayRead()`
308: @*/
309: PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
310: {
311:   PetscFunctionBegin;
312:   PetscCall(VecCUPMGetArrayWriteAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMGetArrayWriteAsync with DeviceType::CUDA as its template argument
313:   PetscFunctionReturn(PETSC_SUCCESS);
314: }

316: // PetscClangLinter pragma disable: -fdoc-internal-linkage
317: /*@C
318:   VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with
319:   `VecCUDAGetArrayWrite()`.

321:   Logically Collective; Asynchronous; No Fortran Support

323:   Input Parameters:
324: + v - the vector
325: - a - the CUDA device pointer.  This pointer is invalid after `VecCUDARestoreArrayWrite()` returns.

327:   Level: intermediate

329:   Note:
330:   Data on the host will be marked as out of date. Subsequent access of the data on the host
331:   side e.g. with `VecGetArray()` will incur a device to host data transfer.

333: .seealso: [](ch_vectors), `VecCUDAGetArrayWrite()`, `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`,
334:           `VecRestoreArrayWrite()`, `VecGetArray()`, `VecRestoreArray()`, `VecGetArrayRead()`
335: @*/
336: PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
337: {
338:   PetscFunctionBegin;
339:   PetscCall(VecCUPMRestoreArrayWriteAsync<DeviceType::CUDA>(v, a)); // calls VecCUPMRestoreArrayWriteAsync with DeviceType::CUDA as its template argument
340:   PetscFunctionReturn(PETSC_SUCCESS);
341: }

343: // PetscClangLinter pragma disable: -fdoc-internal-linkage
344: /*@C
345:   VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a GPU array provided
346:   by the user.

348:   Logically Collective; Asynchronous; No Fortran Support

350:   Input Parameters:
351: + vin - the vector
352: - a   - the GPU array

354:   Level: advanced

356:   Notes:
357:   This routine is useful to avoid copying an array into a vector, though you can return to the
358:   original GPU array with a call to `VecCUDAResetArray()`.

360:   It is not possible to use `VecCUDAPlaceArray()` and `VecPlaceArray()` at the same time on the
361:   same vector.

363:   `vin` does not take ownership of `a` in any way. The user must free `a` themselves
364:   but be careful not to do so before the vector has either been destroyed, had its original
365:   array restored with `VecCUDAResetArray()` or permanently replaced with
366:   `VecCUDAReplaceArray()`.

368: .seealso: [](ch_vectors), `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`,
369:           `VecResetArray()`, `VecCUDAResetArray()`, `VecCUDAReplaceArray()`
370: @*/
371: PetscErrorCode VecCUDAPlaceArray(Vec vin, const PetscScalar a[])
372: {
373:   PetscFunctionBegin;
374:   PetscCall(VecCUPMPlaceArrayAsync<DeviceType::CUDA>(vin, a)); // calls VecCUPMPlaceArrayAsync with DeviceType::CUDA as its template argument
375:   PetscFunctionReturn(PETSC_SUCCESS);
376: }

378: // PetscClangLinter pragma disable: -fdoc-internal-linkage
379: /*@C
380:   VecCUDAReplaceArray - Permanently replace the GPU array in a vector with a GPU array provided
381:   by the user.

383:   Logically Collective; No Fortran Support

385:   Input Parameters:
386: + vin - the vector
387: - a   - the GPU array

389:   Level: advanced

391:   Notes:
392:   This is useful to avoid copying a GPU array into a vector.

394:   This frees the memory associated with the old GPU array. The vector takes ownership of the
395:   passed array so it CANNOT be freed by the user. It will be freed when the vector is
396:   destroyed.

398: .seealso: [](ch_vectors), `VecGetArray()`, `VecRestoreArray()`, `VecPlaceArray()`, `VecResetArray()`,
399:           `VecCUDAResetArray()`, `VecCUDAPlaceArray()`, `VecReplaceArray()`
400: @*/
401: PetscErrorCode VecCUDAReplaceArray(Vec vin, const PetscScalar a[])
402: {
403:   PetscFunctionBegin;
404:   PetscCall(VecCUPMReplaceArrayAsync<DeviceType::CUDA>(vin, a)); // calls VecCUPMReplaceArrayAsync with DeviceType::CUDA as its template argument
405:   PetscFunctionReturn(PETSC_SUCCESS);
406: }

408: // PetscClangLinter pragma disable: -fdoc-internal-linkage
409: /*@C
410:   VecCUDAResetArray - Resets a vector to use its default memory.

412:   Logically Collective; No Fortran Support

414:   Input Parameter:
415: . vin - the vector

417:   Level: advanced

419:   Note:
420:   Call this after the use of `VecCUDAPlaceArray()`.

422: .seealso: [](ch_vectors), `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecPlaceArray()`,
423:           `VecResetArray()`, `VecCUDAPlaceArray()`, `VecCUDAReplaceArray()`
424: @*/
425: PetscErrorCode VecCUDAResetArray(Vec vin)
426: {
427:   PetscFunctionBegin;
428:   PetscCall(VecCUPMResetArrayAsync<DeviceType::CUDA>(vin)); // calls VecCUPMResetArrayAsync with DeviceType::CUDA as its template argument
429:   PetscFunctionReturn(PETSC_SUCCESS);
430: }
430: }