Actual source code: cupmatomics.hpp

  1: #pragma once

  3: /*====================================================================================*/
  4: /*                             Atomic operations on device                            */
  5: /*====================================================================================*/
  6: #include <petscdevice_cupm.h>
  7: #include <petscsystypes.h>

  9: /* For the purposes of function overloading, long long int is a distinct type from int64_t, which PetscInt may be defined as.
 10:    We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
 11:  */
 12: typedef long long int          llint;
 13: typedef unsigned long long int ullint;

 15: #if PetscDefined(USING_NVCC)
 16: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wunused-function")
 17: /*
 18:   Atomic Insert (exchange) operations

 20:   CUDA C Programming Guide V10.1 Chapter B.12.1.3:

 22:   int atomicExch(int* address, int val);
 23:   unsigned int atomicExch(unsigned int* address, unsigned int val);
 24:   unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
 25:   float atomicExch(float* address, float val);

 27:   reads the 32-bit or 64-bit word old located at the address in global or shared
 28:   memory and stores val back to memory at the same address. These two operations are
 29:   performed in one atomic transaction. The function returns old.

 31:   PETSc notes:

 33:   Atomic exchange may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

 35:   VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
 36:   atomic insertion, since it does not need the old value; a 32-bit or 64-bit store instruction should
 37:   be atomic by itself.

 39:   With bs > 1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee
 40:   that the whole insertion is atomic. We hope no user code relies on that.
 41: */
 42: __device__ static double atomicExch(double *address, double val)
 43: {
 44:   return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
 45: }

 47: __device__ static llint atomicExch(llint *address, llint val)
 48: {
 49:   return (llint)(atomicExch((ullint *)address, (ullint)val));
 50: }

 52: template <typename Type>
 53: struct AtomicInsert {
 54:   __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
 55: };
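
/* A minimal usage sketch (illustration only, not part of the PETSc API): a fetch-and-insert kernel in the
   spirit of PetscSFFetchAndOp with op = MPI_REPLACE. Each thread atomically stores a leaf value into a root
   location and keeps the value the root held before. The kernel name and its arguments are hypothetical. */
__global__ static void ExampleFetchAndInsert(PetscInt n, const double *leaf, double *root, double *leafupdate)
{
  AtomicInsert<double> op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    leafupdate[i] = op(root[i], leaf[i]); /* op() returns the value root[i] held before the exchange */
  }
}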

 57:   #if defined(PETSC_HAVE_COMPLEX)
 58:     #if defined(PETSC_USE_REAL_DOUBLE)
 59: /* CUDA does not support 128-bit atomics. Users should not concurrently insert different 128-bit PetscComplex values at the same location */
 60: template <>
 61: struct AtomicInsert<PetscComplex> {
 62:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
 63:   {
 64:     PetscComplex         old, *z = &old;
 65:     double              *xp = (double *)&x, *yp = (double *)&y;
 66:     AtomicInsert<double> op;
 67:     z[0] = op(xp[0], yp[0]);
 68:     z[1] = op(xp[1], yp[1]);
 69:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
 70:   }
 71: };
 72:     #elif defined(PETSC_USE_REAL_SINGLE)
 73: template <>
 74: struct AtomicInsert<PetscComplex> {
 75:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
 76:   {
 77:     double              *xp = (double *)&x, *yp = (double *)&y;
 78:     AtomicInsert<double> op;
 79:     return op(xp[0], yp[0]);
 80:   }
 81: };
 82:     #endif
 83:   #endif

 85: /*
 86:   Atomic add operations

 88:   CUDA C Programming Guide V10.1 Chapter B.12.1.1:

 90:   int atomicAdd(int* address, int val);
 91:   unsigned int atomicAdd(unsigned int* address,unsigned int val);
 92:   unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
 93:   float atomicAdd(float* address, float val);
 94:   double atomicAdd(double* address, double val);
 95:   __half2 atomicAdd(__half2 *address, __half2 val);
 96:   __half atomicAdd(__half *address, __half val);

 98:   reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
 99:   and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
100:   function returns old.

102:   The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
103:   The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
104:   The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
105:   higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
106:   the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
107:   The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
108: */
109: __device__ static llint atomicAdd(llint *address, llint val)
110: {
111:   return (llint)atomicAdd((ullint *)address, (ullint)val);
112: }

114: template <typename Type>
115: struct AtomicAdd {
116:   __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
117: };

119: template <>
120: struct AtomicAdd<double> {
121:   __device__ double operator()(double &x, double y) const
122:   {
123:   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
124:     return atomicAdd(&x, y);
125:   #else
126:     double *address = &x, val = y;
127:     ullint *address_as_ull = (ullint *)address;
128:     ullint  old            = *address_as_ull, assumed;
129:     do {
130:       assumed = old;
131:       old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
132:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
133:     } while (assumed != old);
134:     return __longlong_as_double(old);
135:   #endif
136:   }
137: };

139: template <>
140: struct AtomicAdd<float> {
141:   __device__ float operator()(float &x, float y) const
142:   {
143:   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
144:     return atomicAdd(&x, y);
145:   #else
146:     float *address = &x, val = y;
147:     int   *address_as_int = (int *)address;
148:     int    old            = *address_as_int, assumed;
149:     do {
150:       assumed = old;
151:       old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
152:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
153:     } while (assumed != old);
154:     return __int_as_float(old);
155:   #endif
156:   }
157: };

159:   #if defined(PETSC_HAVE_COMPLEX)
160: template <>
161: struct AtomicAdd<PetscComplex> {
162:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
163:   {
164:     PetscComplex         old, *z = &old;
165:     PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
166:     AtomicAdd<PetscReal> op;
167:     z[0] = op(xp[0], yp[0]);
168:     z[1] = op(xp[1], yp[1]);
169:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
170:   }
171: };
172:   #endif
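
/* A minimal sketch (illustration only) of how these functors are typically consumed: a generic kernel takes
   the atomic operation as a template parameter and applies it element-wise, as in a scatter/unpack with an
   MPI-like op. The kernel ExampleScatterWithOp and the index array idx are hypothetical names. */
template <typename Type, class Op>
__global__ static void ExampleScatterWithOp(PetscInt n, const PetscInt *idx, const Type *src, Type *dst)
{
  Op op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    (void)op(dst[idx[i]], src[i]); /* e.g. with Op = AtomicAdd<Type>, atomically adds src[i] into dst[idx[i]] */
  }
}
/* Possible launch: ExampleScatterWithOp<double, AtomicAdd<double>><<<nblocks, 256>>>(n, idx, src, dst); */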

174:   /*
175:   Atomic Mult operations:

177:   CUDA has no atomicMult at all, so we build our own with atomicCAS
178:  */
179:   #if defined(PETSC_USE_REAL_DOUBLE)
180: __device__ static double atomicMult(double *address, double val)
181: {
182:   ullint *address_as_ull = (ullint *)(address);
183:   ullint  old            = *address_as_ull, assumed;
184:   do {
185:     assumed = old;
186:     /* Other threads can access and modify the value of *address_as_ull between the read above and the write below */
187:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
188:   } while (assumed != old);
189:   return __longlong_as_double(old);
190: }
191:   #elif defined(PETSC_USE_REAL_SINGLE)
192: __device__ static float atomicMult(float *address, float val)
193: {
194:   int *address_as_int = (int *)(address);
195:   int  old            = *address_as_int, assumed;
196:   do {
197:     assumed = old;
198:     old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
199:   } while (assumed != old);
200:   return __int_as_float(old);
201: }
202:   #endif

204: __device__ static int atomicMult(int *address, int val)
205: {
206:   int *address_as_int = (int *)(address);
207:   int  old            = *address_as_int, assumed;
208:   do {
209:     assumed = old;
210:     old     = atomicCAS(address_as_int, assumed, val * assumed);
211:   } while (assumed != old);
212:   return (int)old;
213: }

215: __device__ static llint atomicMult(llint *address, llint val)
216: {
217:   ullint *address_as_ull = (ullint *)(address);
218:   ullint  old            = *address_as_ull, assumed;
219:   do {
220:     assumed = old;
221:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
222:   } while (assumed != old);
223:   return (llint)old;
224: }

226: template <typename Type>
227: struct AtomicMult {
228:   __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
229: };
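
/* A usage sketch (illustration only): an MPI_PROD-like reduction of an array into a single location.
   *prod should be initialized to 1 before launch. The kernel name and Real (float or double, matching the
   atomicMult() overloads above) are hypothetical. */
template <typename Real>
__global__ static void ExampleProductReduce(PetscInt n, const Real *x, Real *prod)
{
  AtomicMult<Real> op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) (void)op(*prod, x[i]);
}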

231: /*
232:   Atomic Min/Max operations

234:   CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

236:   int atomicMin(int* address, int val);
237:   unsigned int atomicMin(unsigned int* address,unsigned int val);
238:   unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

240:   reads the 32-bit or 64-bit word old located at the address in global or shared
241:   memory, computes the minimum of old and val, and stores the result back to memory
242:   at the same address. These three operations are performed in one atomic transaction.
243:   The function returns old.
244:   The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

246:   atomicMax() is similar.
247:  */

249:   #if defined(PETSC_USE_REAL_DOUBLE)
250: __device__ static double atomicMin(double *address, double val)
251: {
252:   ullint *address_as_ull = (ullint *)(address);
253:   ullint  old            = *address_as_ull, assumed;
254:   do {
255:     assumed = old;
256:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
257:   } while (assumed != old);
258:   return __longlong_as_double(old);
259: }

261: __device__ static double atomicMax(double *address, double val)
262: {
263:   ullint *address_as_ull = (ullint *)(address);
264:   ullint  old            = *address_as_ull, assumed;
265:   do {
266:     assumed = old;
267:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
268:   } while (assumed != old);
269:   return __longlong_as_double(old);
270: }
271:   #elif defined(PETSC_USE_REAL_SINGLE)
272: __device__ static float atomicMin(float *address, float val)
273: {
274:   int *address_as_int = (int *)(address);
275:   int  old            = *address_as_int, assumed;
276:   do {
277:     assumed = old;
278:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
279:   } while (assumed != old);
280:   return __int_as_float(old);
281: }

283: __device__ static float atomicMax(float *address, float val)
284: {
285:   int *address_as_int = (int *)(address);
286:   int  old            = *address_as_int, assumed;
287:   do {
288:     assumed = old;
289:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
290:   } while (assumed != old);
291:   return __int_as_float(old);
292: }
293:   #endif

295:   /*
296:   atomicMin/Max(long long *, long long) are not in NVIDIA's documentation, but on OLCF Summit we found
297:   atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
298:   This causes compilation errors with PGI compilers and 64-bit indices:
299:       error: function "atomicMin(long long *, long long)" has already been defined

301:   So we guard our definitions with the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320).
302: */
303:   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
304: __device__ static llint atomicMin(llint *address, llint val)
305: {
306:   ullint *address_as_ull = (ullint *)(address);
307:   ullint  old            = *address_as_ull, assumed;
308:   do {
309:     assumed = old;
310:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
311:   } while (assumed != old);
312:   return (llint)old;
313: }

315: __device__ static llint atomicMax(llint *address, llint val)
316: {
317:   ullint *address_as_ull = (ullint *)(address);
318:   ullint  old            = *address_as_ull, assumed;
319:   do {
320:     assumed = old;
321:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
322:   } while (assumed != old);
323:   return (llint)old;
324: }
325:   #endif

327: template <typename Type>
328: struct AtomicMin {
329:   __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
330: };
331: template <typename Type>
332: struct AtomicMax {
333:   __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
334: };
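
/* A usage sketch (illustration only): a grid-wide max reduction into a single location; *mx should be
   initialized (e.g. to PETSC_MIN_REAL) before launch. The kernel name and Real (float or double, matching
   the atomicMax() overloads above) are hypothetical. */
template <typename Real>
__global__ static void ExampleMaxReduce(PetscInt n, const Real *x, Real *mx)
{
  AtomicMax<Real> op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) (void)op(*mx, x[i]);
}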

336: /*
337:   Atomic bitwise operations

339:   CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

341:   int atomicAnd(int* address, int val);
342:   unsigned int atomicAnd(unsigned int* address,unsigned int val);
343:   unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

345:   reads the 32-bit or 64-bit word old located at the address in global or shared
346:   memory, computes (old & val), and stores the result back to memory at the same
347:   address. These three operations are performed in one atomic transaction.
348:   The function returns old.

350:   The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

352:   atomicOr() and atomicXor are similar.
353: */

355:   #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
356: __device__ static llint atomicAnd(llint *address, llint val)
357: {
358:   ullint *address_as_ull = (ullint *)(address);
359:   ullint  old            = *address_as_ull, assumed;
360:   do {
361:     assumed = old;
362:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
363:   } while (assumed != old);
364:   return (llint)old;
365: }
366: __device__ static llint atomicOr(llint *address, llint val)
367: {
368:   ullint *address_as_ull = (ullint *)(address);
369:   ullint  old            = *address_as_ull, assumed;
370:   do {
371:     assumed = old;
372:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
373:   } while (assumed != old);
374:   return (llint)old;
375: }

377: __device__ static llint atomicXor(llint *address, llint val)
378: {
379:   ullint *address_as_ull = (ullint *)(address);
380:   ullint  old            = *address_as_ull, assumed;
381:   do {
382:     assumed = old;
383:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
384:   } while (assumed != old);
385:   return (llint)old;
386: }
387:   #endif

389: template <typename Type>
390: struct AtomicBAND {
391:   __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
392: };
393: template <typename Type>
394: struct AtomicBOR {
395:   __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
396: };
397: template <typename Type>
398: struct AtomicBXOR {
399:   __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
400: };
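
/* A usage sketch (illustration only): many threads setting bits in a shared bitmask with a bitwise OR,
   e.g. to mark which items have been touched. The kernel, the item array and the int flag words are
   hypothetical. */
__global__ static void ExampleMarkItems(PetscInt n, const PetscInt *item, int *flags)
{
  AtomicBOR<int> op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    (void)op(flags[item[i] / 32], (int)(1u << (item[i] % 32))); /* set bit item[i]%32 in word item[i]/32 */
  }
}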

402: /*
403:   Atomic logical operations:

405:   CUDA has no atomic logical operations at all. We support them on integer types.
406: */

408: /* A primary template that is declared but not defined makes any instantiation that does not match one of the
409:    specializations below a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
410:  */
411: template <typename Type, class Op, int size /* sizeof(Type) */>
412: struct AtomicLogical;
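
/* For illustration only: the same constraint could be expressed with a defined primary template whose
   instantiation trips a static_assert for unsupported sizes. The name ExampleAtomicLogicalChecked is
   hypothetical; the header itself relies on the undefined primary template above. */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct ExampleAtomicLogicalChecked {
  static_assert(size == 4 || size == 8, "atomic logical operations only support 32-bit and 64-bit integer types");
};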

414: template <typename Type, class Op>
415: struct AtomicLogical<Type, Op, 4> {
416:   __device__ Type operator()(Type &x, Type y) const
417:   {
418:     int *address_as_int = (int *)(&x);
419:     int  old            = *address_as_int, assumed;
420:     Op   op;
421:     do {
422:       assumed = old;
423:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
424:     } while (assumed != old);
425:     return (Type)old;
426:   }
427: };

429: template <typename Type, class Op>
430: struct AtomicLogical<Type, Op, 8> {
431:   __device__ Type operator()(Type &x, Type y) const
432:   {
433:     ullint *address_as_ull = (ullint *)(&x);
434:     ullint  old            = *address_as_ull, assumed;
435:     Op      op;
436:     do {
437:       assumed = old;
438:       old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
439:     } while (assumed != old);
440:     return (Type)old;
441:   }
442: };

444: /* Note: land/lor/lxor below are different from the AtomicLAND/LOR/LXOR functors. Here we pass arguments by value and return the result of the op (not the old value) */
445: template <typename Type>
446: struct land {
447:   __device__ Type operator()(Type x, Type y) { return x && y; }
448: };
449: template <typename Type>
450: struct lor {
451:   __device__ Type operator()(Type x, Type y) { return x || y; }
452: };
453: template <typename Type>
454: struct lxor {
455:   __device__ Type operator()(Type x, Type y) { return !x != !y; }
456: };

458: template <typename Type>
459: struct AtomicLAND {
460:   __device__ Type operator()(Type &x, Type y) const
461:   {
462:     AtomicLogical<Type, land<Type>, sizeof(Type)> op;
463:     return op(x, y);
464:   }
465: };
466: template <typename Type>
467: struct AtomicLOR {
468:   __device__ Type operator()(Type &x, Type y) const
469:   {
470:     AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
471:     return op(x, y);
472:   }
473: };
474: template <typename Type>
475: struct AtomicLXOR {
476:   __device__ Type operator()(Type &x, Type y) const
477:   {
478:     AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
479:     return op(x, y);
480:   }
481: };
482: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
483: #elif PetscDefined(USING_HCC)

485:   /*
486:   Atomic Insert (exchange) operations

488:   See the CUDA version for comments.
489: */
490:   #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
491: __device__ static double atomicExch(double *address, double val)
492: {
493:   return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
494: }
495:   #endif

497: __device__ static inline llint atomicExch(llint *address, llint val)
498: {
499:   return (llint)(atomicExch((ullint *)address, (ullint)val));
500: }

502: template <typename Type>
503: struct AtomicInsert {
504:   __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
505: };

507:   #if defined(PETSC_HAVE_COMPLEX)
508:     #if defined(PETSC_USE_REAL_DOUBLE)
509: template <>
510: struct AtomicInsert<PetscComplex> {
511:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
512:   {
513:     PetscComplex         old, *z = &old;
514:     double              *xp = (double *)&x, *yp = (double *)&y;
515:     AtomicInsert<double> op;
516:     z[0] = op(xp[0], yp[0]);
517:     z[1] = op(xp[1], yp[1]);
518:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
519:   }
520: };
521:     #elif defined(PETSC_USE_REAL_SINGLE)
522: template <>
523: struct AtomicInsert<PetscComplex> {
524:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
525:   {
526:     double              *xp = (double *)&x, *yp = (double *)&y;
527:     AtomicInsert<double> op;
528:     return op(xp[0], yp[0]);
529:   }
530: };
531:     #endif
532:   #endif

534: /*
535:   Atomic add operations (see the CUDA version for comments)

537: */
538: __device__ static inline llint atomicAdd(llint *address, llint val)
539: {
540:   return (llint)atomicAdd((ullint *)address, (ullint)val);
541: }

543: template <typename Type>
544: struct AtomicAdd {
545:   __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
546: };

548: template <>
549: struct AtomicAdd<double> {
550:   __device__ double operator()(double &x, double y) const
551:   {
552:     /* The CUDA version does more checks (fallbacks for older architectures) that may be needed here as well */
553:     return atomicAdd(&x, y);
554:   }
555: };

557: template <>
558: struct AtomicAdd<float> {
559:   __device__ float operator()(float &x, float y) const
560:   {
561:     /* The CUDA version does more checks (fallbacks for older architectures) that may be needed here as well */
562:     return atomicAdd(&x, y);
563:   }
564: };

566:   #if defined(PETSC_HAVE_COMPLEX)
567: template <>
568: struct AtomicAdd<PetscComplex> {
569:   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
570:   {
571:     PetscComplex         old, *z = &old;
572:     PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
573:     AtomicAdd<PetscReal> op;
574:     z[0] = op(xp[0], yp[0]);
575:     z[1] = op(xp[1], yp[1]);
576:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
577:   }
578: };
579:   #endif

581:   /*
582:   Atomic Mult operations:

584:   HIP has no atomicMult at all, so we build our own with atomicCAS
585:  */
586:   #if defined(PETSC_USE_REAL_DOUBLE)
587: __device__ static inline double atomicMult(double *address, double val)
588: {
589:   ullint *address_as_ull = (ullint *)(address);
590:   ullint  old            = *address_as_ull, assumed;
591:   do {
592:     assumed = old;
593:     /* Other threads can access and modify the value of *address_as_ull between the read above and the write below */
594:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
595:   } while (assumed != old);
596:   return __longlong_as_double(old);
597: }
598:   #elif defined(PETSC_USE_REAL_SINGLE)
599: __device__ static inline float atomicMult(float *address, float val)
600: {
601:   int *address_as_int = (int *)(address);
602:   int  old            = *address_as_int, assumed;
603:   do {
604:     assumed = old;
605:     old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
606:   } while (assumed != old);
607:   return __int_as_float(old);
608: }
609:   #endif

611: __device__ static inline int atomicMult(int *address, int val)
612: {
613:   int *address_as_int = (int *)(address);
614:   int  old            = *address_as_int, assumed;
615:   do {
616:     assumed = old;
617:     old     = atomicCAS(address_as_int, assumed, val * assumed);
618:   } while (assumed != old);
619:   return (int)old;
620: }

622: __device__ static inline llint atomicMult(llint *address, llint val)
623: {
624:   ullint *address_as_ull = (ullint *)(address);
625:   ullint  old            = *address_as_ull, assumed;
626:   do {
627:     assumed = old;
628:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
629:   } while (assumed != old);
630:   return (llint)old;
631: }

633: template <typename Type>
634: struct AtomicMult {
635:   __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
636: };

638:   /*
639:   Atomic Min/Max operations

641:   See CUDA version for comments.
642:  */
643:   #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
644:     #if defined(PETSC_USE_REAL_DOUBLE)
645: __device__ static double atomicMin(double *address, double val)
646: {
647:   ullint *address_as_ull = (ullint *)(address);
648:   ullint  old            = *address_as_ull, assumed;
649:   do {
650:     assumed = old;
651:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
652:   } while (assumed != old);
653:   return __longlong_as_double(old);
654: }

656: __device__ static double atomicMax(double *address, double val)
657: {
658:   ullint *address_as_ull = (ullint *)(address);
659:   ullint  old            = *address_as_ull, assumed;
660:   do {
661:     assumed = old;
662:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
663:   } while (assumed != old);
664:   return __longlong_as_double(old);
665: }
666:     #elif defined(PETSC_USE_REAL_SINGLE)
667: __device__ static float atomicMin(float *address, float val)
668: {
669:   int *address_as_int = (int *)(address);
670:   int  old            = *address_as_int, assumed;
671:   do {
672:     assumed = old;
673:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
674:   } while (assumed != old);
675:   return __int_as_float(old);
676: }

678: __device__ static float atomicMax(float *address, float val)
679: {
680:   int *address_as_int = (int *)(address);
681:   int  old            = *address_as_int, assumed;
682:   do {
683:     assumed = old;
684:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
685:   } while (assumed != old);
686:   return __int_as_float(old);
687: }
688:     #endif
689:   #endif

691:   #if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
692: __device__ static inline llint atomicMin(llint *address, llint val)
693: {
694:   ullint *address_as_ull = (ullint *)(address);
695:   ullint  old            = *address_as_ull, assumed;
696:   do {
697:     assumed = old;
698:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
699:   } while (assumed != old);
700:   return (llint)old;
701: }

703: __device__ static inline llint atomicMax(llint *address, llint val)
704: {
705:   ullint *address_as_ull = (ullint *)(address);
706:   ullint  old            = *address_as_ull, assumed;
707:   do {
708:     assumed = old;
709:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
710:   } while (assumed != old);
711:   return (llint)old;
712: }
713:   #endif

715: template <typename Type>
716: struct AtomicMin {
717:   __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
718: };
719: template <typename Type>
720: struct AtomicMax {
721:   __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
722: };

724: /*
725:   Atomic bitwise operations
726:   As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint *, llint) overloads are not supported
727: */

729: __device__ static inline llint atomicAnd(llint *address, llint val)
730: {
731:   ullint *address_as_ull = (ullint *)(address);
732:   ullint  old            = *address_as_ull, assumed;
733:   do {
734:     assumed = old;
735:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
736:   } while (assumed != old);
737:   return (llint)old;
738: }
739: __device__ static inline llint atomicOr(llint *address, llint val)
740: {
741:   ullint *address_as_ull = (ullint *)(address);
742:   ullint  old            = *address_as_ull, assumed;
743:   do {
744:     assumed = old;
745:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
746:   } while (assumed != old);
747:   return (llint)old;
748: }

750: __device__ static inline llint atomicXor(llint *address, llint val)
751: {
752:   ullint *address_as_ull = (ullint *)(address);
753:   ullint  old            = *address_as_ull, assumed;
754:   do {
755:     assumed = old;
756:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
757:   } while (assumed != old);
758:   return (llint)old;
759: }

761: template <typename Type>
762: struct AtomicBAND {
763:   __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
764: };
765: template <typename Type>
766: struct AtomicBOR {
767:   __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
768: };
769: template <typename Type>
770: struct AtomicBXOR {
771:   __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
772: };

774: /*
775:   Atomic logical operations:

777:   HIP has no atomic logical operations at all. We support them on integer types.
778: */

780: /* A primary template that is declared but not defined makes any instantiation that does not match one of the
781:    specializations below a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
782:  */
783: template <typename Type, class Op, int size /* sizeof(Type) */>
784: struct AtomicLogical;

786: template <typename Type, class Op>
787: struct AtomicLogical<Type, Op, 4> {
788:   __device__ Type operator()(Type &x, Type y) const
789:   {
790:     int *address_as_int = (int *)(&x);
791:     int  old            = *address_as_int, assumed;
792:     Op   op;
793:     do {
794:       assumed = old;
795:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
796:     } while (assumed != old);
797:     return (Type)old;
798:   }
799: };

801: template <typename Type, class Op>
802: struct AtomicLogical<Type, Op, 8> {
803:   __device__ Type operator()(Type &x, Type y) const
804:   {
805:     ullint *address_as_ull = (ullint *)(&x);
806:     ullint  old            = *address_as_ull, assumed;
807:     Op      op;
808:     do {
809:       assumed = old;
810:       old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
811:     } while (assumed != old);
812:     return (Type)old;
813:   }
814: };

816: /* Note: land/lor/lxor below are different from the AtomicLAND/LOR/LXOR functors. Here we pass arguments by value and return the result of the op (not the old value) */
817: template <typename Type>
818: struct land {
819:   __device__ Type operator()(Type x, Type y) { return x && y; }
820: };
821: template <typename Type>
822: struct lor {
823:   __device__ Type operator()(Type x, Type y) { return x || y; }
824: };
825: template <typename Type>
826: struct lxor {
827:   __device__ Type operator()(Type x, Type y) { return !x != !y; }
828: };

830: template <typename Type>
831: struct AtomicLAND {
832:   __device__ Type operator()(Type &x, Type y) const
833:   {
834:     AtomicLogical<Type, land<Type>, sizeof(Type)> op;
835:     return op(x, y);
836:   }
837: };
838: template <typename Type>
839: struct AtomicLOR {
840:   __device__ Type operator()(Type &x, Type y) const
841:   {
842:     AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
843:     return op(x, y);
844:   }
845: };
846: template <typename Type>
847: struct AtomicLXOR {
848:   __device__ Type operator()(Type &x, Type y) const
849:   {
850:     AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
851:     return op(x, y);
852:   }
853: };
854: #endif