Actual source code: cupmatomics.hpp

#pragma once

/*====================================================================================*/
/*                             Atomic operations on device                            */
/*====================================================================================*/
#include <petscdevice_cupm.h>
#include <petscsystypes.h>

/* In terms of function overloading, long long int is a different type from int64_t, which PetscInt may be defined as.
   We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
 */
typedef long long int          llint;
typedef unsigned long long int ullint;

#if PetscDefined(USING_NVCC)
/*
  Atomic Insert (exchange) operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.3:

  int atomicExch(int* address, int val);
  unsigned int atomicExch(unsigned int* address, unsigned int val);
  unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
  float atomicExch(float* address, float val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory and stores val back to memory at the same address. These two operations are
  performed in one atomic transaction. The function returns old.

  PETSc notes:

  It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
  be atomic itself.

  With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
  insertion is atomic. We hope no user code relies on that.
*/
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}

__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};
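
/*
  Usage sketch (illustrative only; the kernel below is hypothetical and not part of this header):
  the functor takes the destination by reference and the new value by value, and returns the old value.

    __global__ void InsertKernel(double *dest, const double *src, int n)
    {
      AtomicInsert<double> insert;
      int                  i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) (void)insert(dest[i], src[i]); // atomically stores src[i] into dest[i]; the old value is discarded
    }
*/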

  #if defined(PETSC_HAVE_COMPLEX)
    #if defined(PETSC_USE_REAL_DOUBLE)
/* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values into the same location */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
    #elif defined(PETSC_USE_REAL_SINGLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    return op(xp[0], yp[0]);
  }
};
    #endif
  #endif

/*
  Atomic add operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.1:

  int atomicAdd(int* address, int val);
  unsigned int atomicAdd(unsigned int* address, unsigned int val);
  unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val);
  float atomicAdd(float* address, float val);
  double atomicAdd(double* address, double val);
  __half2 atomicAdd(__half2 *address, __half2 val);
  __half atomicAdd(__half *address, __half val);

  reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
  and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
  function returns old.

  The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
  The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
  The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
  higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
  the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
  The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
*/
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};
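
/*
  These functor types exist so that device kernels can be templated on the operation. A minimal
  sketch of how such a functor might be consumed (the kernel below is hypothetical, not part of
  this header):

    template <typename Type, class Op>
    __global__ void UnpackKernel(Type *dest, const Type *buf, int n)
    {
      Op  op;
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) (void)op(dest[i], buf[i]); // e.g. Op = AtomicAdd<Type> or AtomicInsert<Type>
    }
*/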

template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(&x, y);
  #else
    double *address = &x, val = y;
    ullint *address_as_ull = (ullint *)address;
    ullint  old            = *address_as_ull, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
  #endif
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return atomicAdd(&x, y);
  #else
    float *address = &x, val = y;
    int   *address_as_int = (int *)address;
    int    old            = *address_as_int, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
  #endif
  }
};

  #if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
  #endif

  /*
  Atomic Mult operations:

  CUDA has no atomicMult at all, so we build our own with atomicCAS
 */
  #if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
  #elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
  #endif

__device__ static int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};

/*
  Atomic Min/Max operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

  int atomicMin(int* address, int val);
  unsigned int atomicMin(unsigned int* address, unsigned int val);
  unsigned long long int atomicMin(unsigned long long int* address, unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes the minimum of old and val, and stores the result back to memory
  at the same address. These three operations are performed in one atomic transaction.
  The function returns old.
  The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

  atomicMax() is similar.
 */

  #if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
  #elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
  #endif

  /*
  atomicMin/Max(long long *, long long) are not in NVIDIA's documentation, but on OLCF Summit we found
  atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
  This causes compilation errors with PGI compilers and 64-bit indices:
      error: function "atomicMin(long long *, long long)" has already been defined

  So we add the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
*/
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
  #endif

template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};

/*
  Atomic bitwise operations

  CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

  int atomicAnd(int* address, int val);
  unsigned int atomicAnd(unsigned int* address, unsigned int val);
  unsigned long long int atomicAnd(unsigned long long int* address, unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes (old & val), and stores the result back to memory at the same
  address. These three operations are performed in one atomic transaction.
  The function returns old.

  The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

  atomicOr() and atomicXor() are similar.
*/

  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
  #endif

template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};

/*
  Atomic logical operations:

  CUDA has no atomic logical operations at all. We support them on integer types.
*/

/* A template without a definition makes any instantiation that does not match one of the given specializations
   a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
 */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;
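/* For example, declaring an object of AtomicLogical<short, land<short>, 2> (a hypothetical 16-bit case)
   fails to compile, because no specialization below matches and the primary template has no definition. */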

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int  old            = *address_as_int, assumed;
    Op   op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint  old            = *address_as_ull, assumed;
    Op      op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note that land/lor/lxor below are different from the atomic logical operations (AtomicLAND etc.): here we pass arguments by value and return the result of the op (not the old value) */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};

template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
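
/*
  Usage sketch (illustrative only; the kernel below is hypothetical): note that the logical functors
  store the boolean result of the op back into the destination, so afterwards dest[i] holds 0 or 1
  (as the given integer type) rather than its original value.

    __global__ void LandKernel(llint *dest, const llint *src, int n)
    {
      AtomicLAND<llint> op;
      int               i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) (void)op(dest[i], src[i]); // dest[i] = dest[i] && src[i], performed atomically
    }
*/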

#elif PetscDefined(USING_HCC)

  /*
  Atomic Insert (exchange) operations

  See the CUDA version.
*/
  #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}
  #endif

__device__ static inline llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};

  #if defined(PETSC_HAVE_COMPLEX)
    #if defined(PETSC_USE_REAL_DOUBLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
    #elif defined(PETSC_USE_REAL_SINGLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    return op(xp[0], yp[0]);
  }
};
    #endif
  #endif

/*
  Atomic add operations
*/
__device__ static inline llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};

template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
    /* The CUDA version does more checks, which may be needed here too */
    return atomicAdd(&x, y);
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
    /* The CUDA version does more checks, which may be needed here too */
    return atomicAdd(&x, y);
  }
};

  #if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
  #endif

  /*
  Atomic Mult operations:

  HIP has no atomicMult at all, so we build our own with atomicCAS
 */
  #if defined(PETSC_USE_REAL_DOUBLE)
__device__ static inline double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
  #elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
  #endif

__device__ static inline int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static inline llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};

  /*
  Atomic Min/Max operations

  See CUDA version for comments.
 */
  #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
    #if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
    #elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
    #endif
  #endif

  #if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
  #endif

template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};

/*
  Atomic bitwise operations
  As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) overloads are not supported
*/

__device__ static inline llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static inline llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static inline llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};

/*
  Atomic logical operations:

  Like CUDA, HIP has no atomic logical operations at all. We support them on integer types.
*/

/* A template without a definition makes any instantiation that does not match one of the given specializations
   a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
 */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int  old            = *address_as_int, assumed;
    Op   op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint  old            = *address_as_ull, assumed;
    Op      op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note that land/lor/lxor below are different from the atomic logical operations (AtomicLAND etc.): here we pass arguments by value and return the result of the op (not the old value) */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};

template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
#endif