Actual source code: cupmatomics.hpp
1: #pragma once
3: /*====================================================================================*/
4: /* Atomic operations on device */
5: /*====================================================================================*/
6: #include <petscdevice_cupm.h>
7: #include <petscsystypes.h>
9: /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt may be defined as.
10: We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
11: */
12: typedef long long int llint;
13: typedef unsigned long long int ullint;
15: #if PetscDefined(USING_NVCC)
16: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wunused-function")
17: /*
18: Atomic Insert (exchange) operations
20: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
22: int atomicExch(int* address, int val);
23: unsigned int atomicExch(unsigned int* address, unsigned int val);
24: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
25: float atomicExch(float* address, float val);
27: reads the 32-bit or 64-bit word old located at the address in global or shared
28: memory and stores val back to memory at the same address. These two operations are
29: performed in one atomic transaction. The function returns old.
31: PETSc notes:
33: atomicExch() may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.
35: VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
36: atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
37: be atomic itself.
39: With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
40: insertion is atomic. We hope no user code relies on that.
41: */
42: __device__ static double atomicExch(double *address, double val)
43: {
44: return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
45: }
47: __device__ static llint atomicExch(llint *address, llint val)
48: {
49: return (llint)(atomicExch((ullint *)address, (ullint)val));
50: }
52: template <typename Type>
53: struct AtomicInsert {
54: __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
55: };
57: #if defined(PETSC_HAVE_COMPLEX)
58: #if defined(PETSC_USE_REAL_DOUBLE)
59: /* CUDA does not support 128-bit atomics. Users should not concurrently insert different 128-bit PetscComplex values at the same location */
60: template <>
61: struct AtomicInsert<PetscComplex> {
62: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
63: {
64: PetscComplex old, *z = &old;
65: double *xp = (double *)&x, *yp = (double *)&y;
66: AtomicInsert<double> op;
67: z[0] = op(xp[0], yp[0]);
68: z[1] = op(xp[1], yp[1]);
69: return old; /* The returned value may not be atomic. It can be a mix of the two operations. The caller should discard it. */
70: }
71: };
72: #elif defined(PETSC_USE_REAL_SINGLE)
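/* For single-precision complex (two 32-bit floats, 64 bits in total), the code below reinterprets the whole value as a double and exchanges it with one 64-bit atomic */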
73: template <>
74: struct AtomicInsert<PetscComplex> {
75: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
76: {
77: double *xp = (double *)&x, *yp = (double *)&y;
78: AtomicInsert<double> op;
79: return op(xp[0], yp[0]);
80: }
81: };
82: #endif
83: #endif
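/* A minimal usage sketch (not part of this header): how an element-wise functor such as
   AtomicInsert<Type> is typically applied from a kernel, e.g. when several entries of a
   scatter may target the same location. The kernel and parameter names are hypothetical,
   not PETSc API. */
template <typename Type, class Op>
__global__ void ExampleApplyOpKernel(PetscInt n, const PetscInt *idx, Type *data, const Type *buf)
{
  Op op;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    (void)op(data[idx[i]], buf[i]); /* the returned old value is discarded */
  }
}
/* e.g. ExampleApplyOpKernel<double, AtomicInsert<double>><<<64, 256>>>(n, idx, data, buf); */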
85: /*
86: Atomic add operations
88: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
90: int atomicAdd(int* address, int val);
91: unsigned int atomicAdd(unsigned int* address,unsigned int val);
92: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
93: float atomicAdd(float* address, float val);
94: double atomicAdd(double* address, double val);
95: __half2 atomicAdd(__half2 *address, __half2 val);
96: __half atomicAdd(__half *address, __half val);
98: reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
99: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
100: function returns old.
102: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
103: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
104: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
105: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
106: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
107: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
108: */
109: __device__ static llint atomicAdd(llint *address, llint val)
110: {
111: return (llint)atomicAdd((ullint *)address, (ullint)val);
112: }
114: template <typename Type>
115: struct AtomicAdd {
116: __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
117: };
119: template <>
120: struct AtomicAdd<double> {
121: __device__ double operator()(double &x, double y) const
122: {
123: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
124: return atomicAdd(&x, y);
125: #else
126: double *address = &x, val = y;
127: ullint *address_as_ull = (ullint *)address;
128: ullint old = *address_as_ull, assumed;
129: do {
130: assumed = old;
131: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
132: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
133: } while (assumed != old);
134: return __longlong_as_double(old);
135: #endif
136: }
137: };
139: template <>
140: struct AtomicAdd<float> {
141: __device__ float operator()(float &x, float y) const
142: {
143: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
144: return atomicAdd(&x, y);
145: #else
146: float *address = &x, val = y;
147: int *address_as_int = (int *)address;
148: int old = *address_as_int, assumed;
149: do {
150: assumed = old;
151: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
152: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
153: } while (assumed != old);
154: return __int_as_float(old);
155: #endif
156: }
157: };
159: #if defined(PETSC_HAVE_COMPLEX)
160: template <>
161: struct AtomicAdd<PetscComplex> {
162: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
163: {
164: PetscComplex old, *z = &old;
165: PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
166: AtomicAdd<PetscReal> op;
167: z[0] = op(xp[0], yp[0]);
168: z[1] = op(xp[1], yp[1]);
169: return old; /* The returned value may not be atomic. It can be a mix of the two operations. The caller should discard it. */
170: }
171: };
172: #endif
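/* A minimal usage sketch (not part of this header): many threads accumulating into the same
   destination with AtomicAdd, as happens when several scattered indices map to one location
   with ADD_VALUES. The kernel name is hypothetical. */
template <typename Type>
__global__ void ExampleAtomicSumKernel(PetscInt n, const Type *x, Type *sum)
{
  AtomicAdd<Type> add;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) (void)add(*sum, x[i]);
}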
174: /*
175: Atomic Mult operations:
177: CUDA has no atomicMult at all, so we build our own with atomicCAS
178: */
179: #if defined(PETSC_USE_REAL_DOUBLE)
180: __device__ static double atomicMult(double *address, double val)
181: {
182: ullint *address_as_ull = (ullint *)(address);
183: ullint old = *address_as_ull, assumed;
184: do {
185: assumed = old;
186: /* Other threads may modify *address_as_ull between the read above and the atomicCAS below, hence the retry loop */
187: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
188: } while (assumed != old);
189: return __longlong_as_double(old);
190: }
191: #elif defined(PETSC_USE_REAL_SINGLE)
192: __device__ static float atomicMult(float *address, float val)
193: {
194: int *address_as_int = (int *)(address);
195: int old = *address_as_int, assumed;
196: do {
197: assumed = old;
198: old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
199: } while (assumed != old);
200: return __int_as_float(old);
201: }
202: #endif
204: __device__ static int atomicMult(int *address, int val)
205: {
206: int *address_as_int = (int *)(address);
207: int old = *address_as_int, assumed;
208: do {
209: assumed = old;
210: old = atomicCAS(address_as_int, assumed, val * assumed);
211: } while (assumed != old);
212: return (int)old;
213: }
215: __device__ static llint atomicMult(llint *address, llint val)
216: {
217: ullint *address_as_ull = (ullint *)(address);
218: ullint old = *address_as_ull, assumed;
219: do {
220: assumed = old;
221: old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
222: } while (assumed != old);
223: return (llint)old;
224: }
226: template <typename Type>
227: struct AtomicMult {
228: __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
229: };
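/* A sketch (not part of this header) of the general atomicCAS retry pattern that atomicMult above
   and the other CAS-based emulations in this file follow: read the current bits, compute the new
   value, and publish it only if the word has not changed in the meantime; otherwise retry with the
   freshly observed value. The helper name and its BinaryOp parameter are hypothetical. */
template <class BinaryOp>
__device__ static float ExampleAtomicApply(float *address, float val, BinaryOp op)
{
  int *address_as_int = (int *)address;
  int  old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(op(__int_as_float(assumed), val)));
  } while (assumed != old); /* integer comparison, so the loop also terminates when the result is NaN */
  return __int_as_float(old);
}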
231: /*
232: Atomic Min/Max operations
234: CUDA C Programming Guide V10.1 Chapters B.12.1.4-B.12.1.5:
236: int atomicMin(int* address, int val);
237: unsigned int atomicMin(unsigned int* address,unsigned int val);
238: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
240: reads the 32-bit or 64-bit word old located at the address in global or shared
241: memory, computes the minimum of old and val, and stores the result back to memory
242: at the same address. These three operations are performed in one atomic transaction.
243: The function returns old.
244: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
246: atomicMax() is similar.
247: */
249: #if defined(PETSC_USE_REAL_DOUBLE)
250: __device__ static double atomicMin(double *address, double val)
251: {
252: ullint *address_as_ull = (ullint *)(address);
253: ullint old = *address_as_ull, assumed;
254: do {
255: assumed = old;
256: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
257: } while (assumed != old);
258: return __longlong_as_double(old);
259: }
261: __device__ static double atomicMax(double *address, double val)
262: {
263: ullint *address_as_ull = (ullint *)(address);
264: ullint old = *address_as_ull, assumed;
265: do {
266: assumed = old;
267: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
268: } while (assumed != old);
269: return __longlong_as_double(old);
270: }
271: #elif defined(PETSC_USE_REAL_SINGLE)
272: __device__ static float atomicMin(float *address, float val)
273: {
274: int *address_as_int = (int *)(address);
275: int old = *address_as_int, assumed;
276: do {
277: assumed = old;
278: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
279: } while (assumed != old);
280: return __int_as_float(old);
281: }
283: __device__ static float atomicMax(float *address, float val)
284: {
285: int *address_as_int = (int *)(address);
286: int old = *address_as_int, assumed;
287: do {
288: assumed = old;
289: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
290: } while (assumed != old);
291: return __int_as_float(old);
292: }
293: #endif
295: /*
296: atomicMin/Max(long long *, long long) are not in NVIDIA's documentation, but on OLCF Summit we found
297: atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
298: With PGI compilers and 64-bit indices this causes compilation errors such as:
299: error: function "atomicMin(long long *, long long)" has already been defined
301: So we guard our definitions with the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
302: */
303: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
304: __device__ static llint atomicMin(llint *address, llint val)
305: {
306: ullint *address_as_ull = (ullint *)(address);
307: ullint old = *address_as_ull, assumed;
308: do {
309: assumed = old;
310: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
311: } while (assumed != old);
312: return (llint)old;
313: }
315: __device__ static llint atomicMax(llint *address, llint val)
316: {
317: ullint *address_as_ull = (ullint *)(address);
318: ullint old = *address_as_ull, assumed;
319: do {
320: assumed = old;
321: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
322: } while (assumed != old);
323: return (llint)old;
324: }
325: #endif
327: template <typename Type>
328: struct AtomicMin {
329: __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
330: };
331: template <typename Type>
332: struct AtomicMax {
333: __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
334: };
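/* A minimal usage sketch (not part of this header): computing the minimum and maximum of an array
   into device scalars with the functors above (e.g. Type = PetscReal or int). Names are hypothetical,
   and *minval / *maxval must be initialized suitably before the launch. */
template <typename Type>
__global__ void ExampleMinMaxKernel(PetscInt n, const Type *x, Type *minval, Type *maxval)
{
  AtomicMin<Type> amin;
  AtomicMax<Type> amax;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    (void)amin(*minval, x[i]);
    (void)amax(*maxval, x[i]);
  }
}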
336: /*
337: Atomic bitwise operations
339: CUDA C Programming Guide V10.1 Chapters B.12.2.1-B.12.2.3:
341: int atomicAnd(int* address, int val);
342: unsigned int atomicAnd(unsigned int* address,unsigned int val);
343: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
345: reads the 32-bit or 64-bit word old located at the address in global or shared
346: memory, computes (old & val), and stores the result back to memory at the same
347: address. These three operations are performed in one atomic transaction.
348: The function returns old.
350: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
352: atomicOr() and atomicXor() are similar.
353: */
355: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
356: __device__ static llint atomicAnd(llint *address, llint val)
357: {
358: ullint *address_as_ull = (ullint *)(address);
359: ullint old = *address_as_ull, assumed;
360: do {
361: assumed = old;
362: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
363: } while (assumed != old);
364: return (llint)old;
365: }
366: __device__ static llint atomicOr(llint *address, llint val)
367: {
368: ullint *address_as_ull = (ullint *)(address);
369: ullint old = *address_as_ull, assumed;
370: do {
371: assumed = old;
372: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
373: } while (assumed != old);
374: return (llint)old;
375: }
377: __device__ static llint atomicXor(llint *address, llint val)
378: {
379: ullint *address_as_ull = (ullint *)(address);
380: ullint old = *address_as_ull, assumed;
381: do {
382: assumed = old;
383: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
384: } while (assumed != old);
385: return (llint)old;
386: }
387: #endif
389: template <typename Type>
390: struct AtomicBAND {
391: __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
392: };
393: template <typename Type>
394: struct AtomicBOR {
395: __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
396: };
397: template <typename Type>
398: struct AtomicBXOR {
399: __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
400: };
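/* A minimal usage sketch (not part of this header): many threads OR-ing status bits into one flag
   word, e.g. to record which conditions occurred anywhere in the grid. Names are hypothetical. */
template <typename IntType>
__global__ void ExampleSetFlagsKernel(PetscInt n, const IntType *status, IntType *flags)
{
  AtomicBOR<IntType> bor;
  for (PetscInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) (void)bor(*flags, status[i]);
}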
402: /*
403: Atomic logical operations:
405: CUDA has no atomic logical operations at all. We support them on integer types.
406: */
408: /* A template without a definition makes any instantiation that does not use the given specializations a compile-time error,
409: which is what we want since we only support 32-bit and 64-bit integers.
410: */
411: template <typename Type, class Op, int size /* sizeof(Type) */>
412: struct AtomicLogical;
414: template <typename Type, class Op>
415: struct AtomicLogical<Type, Op, 4> {
416: __device__ Type operator()(Type &x, Type y) const
417: {
418: int *address_as_int = (int *)(&x);
419: int old = *address_as_int, assumed;
420: Op op;
421: do {
422: assumed = old;
423: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
424: } while (assumed != old);
425: return (Type)old;
426: }
427: };
429: template <typename Type, class Op>
430: struct AtomicLogical<Type, Op, 8> {
431: __device__ Type operator()(Type &x, Type y) const
432: {
433: ullint *address_as_ull = (ullint *)(&x);
434: ullint old = *address_as_ull, assumed;
435: Op op;
436: do {
437: assumed = old;
438: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
439: } while (assumed != old);
440: return (Type)old;
441: }
442: };
444: /* Note: land/lor/lxor below differ from the Atomic* functors elsewhere in this file: they take arguments by value and return the result of the operation (not the old value) */
445: template <typename Type>
446: struct land {
447: __device__ Type operator()(Type x, Type y) { return x && y; }
448: };
449: template <typename Type>
450: struct lor {
451: __device__ Type operator()(Type x, Type y) { return x || y; }
452: };
453: template <typename Type>
454: struct lxor {
455: __device__ Type operator()(Type x, Type y) { return !x != !y; }
456: };
458: template <typename Type>
459: struct AtomicLAND {
460: __device__ Type operator()(Type &x, Type y) const
461: {
462: AtomicLogical<Type, land<Type>, sizeof(Type)> op;
463: return op(x, y);
464: }
465: };
466: template <typename Type>
467: struct AtomicLOR {
468: __device__ Type operator()(Type &x, Type y) const
469: {
470: AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
471: return op(x, y);
472: }
473: };
474: template <typename Type>
475: struct AtomicLXOR {
476: __device__ Type operator()(Type &x, Type y) const
477: {
478: AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
479: return op(x, y);
480: }
481: };
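/* A sketch (not part of this header) showing how the undefined primary template AtomicLogical
   restricts use to 32-bit and 64-bit integers: with Int32 = int and Int64 = llint the calls below
   pick the size-4 and size-8 specializations, whereas e.g. AtomicLAND<short> would fail to compile
   if instantiated. The kernel name is hypothetical. */
template <typename Int32, typename Int64>
__global__ void ExampleLogicalKernel(Int32 *flag32, Int32 v32, Int64 *flag64, Int64 v64)
{
  AtomicLAND<Int32> land32; /* uses AtomicLogical<Int32, land<Int32>, 4> when sizeof(Int32) == 4 */
  AtomicLOR<Int64>  lor64;  /* uses AtomicLogical<Int64, lor<Int64>, 8> when sizeof(Int64) == 8 */
  (void)land32(*flag32, v32);
  (void)lor64(*flag64, v64);
}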
482: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
483: #elif PetscDefined(USING_HCC)
485: /*
486: Atomic Insert (exchange) operations
488: See the CUDA version for comments.
489: */
490: #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
491: __device__ static double atomicExch(double *address, double val)
492: {
493: return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
494: }
495: #endif
497: __device__ static inline llint atomicExch(llint *address, llint val)
498: {
499: return (llint)(atomicExch((ullint *)address, (ullint)val));
500: }
502: template <typename Type>
503: struct AtomicInsert {
504: __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
505: };
507: #if defined(PETSC_HAVE_COMPLEX)
508: #if defined(PETSC_USE_REAL_DOUBLE)
509: template <>
510: struct AtomicInsert<PetscComplex> {
511: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
512: {
513: PetscComplex old, *z = &old;
514: double *xp = (double *)&x, *yp = (double *)&y;
515: AtomicInsert<double> op;
516: z[0] = op(xp[0], yp[0]);
517: z[1] = op(xp[1], yp[1]);
518: return old; /* The returned value may not be atomic. It can be a mix of the two operations. The caller should discard it. */
519: }
520: };
521: #elif defined(PETSC_USE_REAL_SINGLE)
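/* For single-precision complex (two 32-bit floats, 64 bits in total), the code below reinterprets the whole value as a double and exchanges it with one 64-bit atomic */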
522: template <>
523: struct AtomicInsert<PetscComplex> {
524: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
525: {
526: double *xp = (double *)&x, *yp = (double *)&y;
527: AtomicInsert<double> op;
528: return op(xp[0], yp[0]);
529: }
530: };
531: #endif
532: #endif
534: /*
535: Atomic add operations
537: */
538: __device__ static inline llint atomicAdd(llint *address, llint val)
539: {
540: return (llint)atomicAdd((ullint *)address, (ullint)val);
541: }
543: template <typename Type>
544: struct AtomicAdd {
545: __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
546: };
548: template <>
549: struct AtomicAdd<double> {
550: __device__ double operator()(double &x, double y) const
551: {
552: /* The CUDA version performs additional compute-capability checks that may be needed */
553: return atomicAdd(&x, y);
554: }
555: };
557: template <>
558: struct AtomicAdd<float> {
559: __device__ float operator()(float &x, float y) const
560: {
561: /* The CUDA version performs additional compute-capability checks that may be needed */
562: return atomicAdd(&x, y);
563: }
564: };
566: #if defined(PETSC_HAVE_COMPLEX)
567: template <>
568: struct AtomicAdd<PetscComplex> {
569: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
570: {
571: PetscComplex old, *z = &old;
572: PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
573: AtomicAdd<PetscReal> op;
574: z[0] = op(xp[0], yp[0]);
575: z[1] = op(xp[1], yp[1]);
576: return old; /* The returned value may not be atomic. It can be a mix of the two operations. The caller should discard it. */
577: }
578: };
579: #endif
581: /*
582: Atomic Mult operations:
584: HIP has no atomicMult at all, so we build our own with atomicCAS
585: */
586: #if defined(PETSC_USE_REAL_DOUBLE)
587: __device__ static inline double atomicMult(double *address, double val)
588: {
589: ullint *address_as_ull = (ullint *)(address);
590: ullint old = *address_as_ull, assumed;
591: do {
592: assumed = old;
593: /* Other threads may modify *address_as_ull between the read above and the atomicCAS below, hence the retry loop */
594: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
595: } while (assumed != old);
596: return __longlong_as_double(old);
597: }
598: #elif defined(PETSC_USE_REAL_SINGLE)
599: __device__ static inline float atomicMult(float *address, float val)
600: {
601: int *address_as_int = (int *)(address);
602: int old = *address_as_int, assumed;
603: do {
604: assumed = old;
605: old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
606: } while (assumed != old);
607: return __int_as_float(old);
608: }
609: #endif
611: __device__ static inline int atomicMult(int *address, int val)
612: {
613: int *address_as_int = (int *)(address);
614: int old = *address_as_int, assumed;
615: do {
616: assumed = old;
617: old = atomicCAS(address_as_int, assumed, val * assumed);
618: } while (assumed != old);
619: return (int)old;
620: }
622: __device__ static inline llint atomicMult(llint *address, llint val)
623: {
624: ullint *address_as_ull = (ullint *)(address);
625: ullint old = *address_as_ull, assumed;
626: do {
627: assumed = old;
628: old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
629: } while (assumed != old);
630: return (llint)old;
631: }
633: template <typename Type>
634: struct AtomicMult {
635: __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
636: };
638: /*
639: Atomic Min/Max operations
641: See CUDA version for comments.
642: */
643: #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
644: #if defined(PETSC_USE_REAL_DOUBLE)
645: __device__ static double atomicMin(double *address, double val)
646: {
647: ullint *address_as_ull = (ullint *)(address);
648: ullint old = *address_as_ull, assumed;
649: do {
650: assumed = old;
651: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
652: } while (assumed != old);
653: return __longlong_as_double(old);
654: }
656: __device__ static double atomicMax(double *address, double val)
657: {
658: ullint *address_as_ull = (ullint *)(address);
659: ullint old = *address_as_ull, assumed;
660: do {
661: assumed = old;
662: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
663: } while (assumed != old);
664: return __longlong_as_double(old);
665: }
666: #elif defined(PETSC_USE_REAL_SINGLE)
667: __device__ static float atomicMin(float *address, float val)
668: {
669: int *address_as_int = (int *)(address);
670: int old = *address_as_int, assumed;
671: do {
672: assumed = old;
673: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
674: } while (assumed != old);
675: return __int_as_float(old);
676: }
678: __device__ static float atomicMax(float *address, float val)
679: {
680: int *address_as_int = (int *)(address);
681: int old = *address_as_int, assumed;
682: do {
683: assumed = old;
684: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
685: } while (assumed != old);
686: return __int_as_float(old);
687: }
688: #endif
689: #endif
691: #if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
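/* Fall back to an atomicCAS-based emulation of 64-bit integer atomicMin/Max on older ROCm; newer
   ROCm presumably provides these overloads natively, hence the version guard above. */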
692: __device__ static inline llint atomicMin(llint *address, llint val)
693: {
694: ullint *address_as_ull = (ullint *)(address);
695: ullint old = *address_as_ull, assumed;
696: do {
697: assumed = old;
698: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
699: } while (assumed != old);
700: return (llint)old;
701: }
703: __device__ static inline llint atomicMax(llint *address, llint val)
704: {
705: ullint *address_as_ull = (ullint *)(address);
706: ullint old = *address_as_ull, assumed;
707: do {
708: assumed = old;
709: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
710: } while (assumed != old);
711: return (llint)old;
712: }
713: #endif
715: template <typename Type>
716: struct AtomicMin {
717: __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
718: };
719: template <typename Type>
720: struct AtomicMax {
721: __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
722: };
724: /*
725: Atomic bitwise operations
726: As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint *, llint) overloads are not supported, so we emulate them with atomicCAS
727: */
729: __device__ static inline llint atomicAnd(llint *address, llint val)
730: {
731: ullint *address_as_ull = (ullint *)(address);
732: ullint old = *address_as_ull, assumed;
733: do {
734: assumed = old;
735: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
736: } while (assumed != old);
737: return (llint)old;
738: }
739: __device__ static inline llint atomicOr(llint *address, llint val)
740: {
741: ullint *address_as_ull = (ullint *)(address);
742: ullint old = *address_as_ull, assumed;
743: do {
744: assumed = old;
745: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
746: } while (assumed != old);
747: return (llint)old;
748: }
750: __device__ static inline llint atomicXor(llint *address, llint val)
751: {
752: ullint *address_as_ull = (ullint *)(address);
753: ullint old = *address_as_ull, assumed;
754: do {
755: assumed = old;
756: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
757: } while (assumed != old);
758: return (llint)old;
759: }
761: template <typename Type>
762: struct AtomicBAND {
763: __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
764: };
765: template <typename Type>
766: struct AtomicBOR {
767: __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
768: };
769: template <typename Type>
770: struct AtomicBXOR {
771: __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
772: };
774: /*
775: Atomic logical operations:
777: Like CUDA, HIP has no atomic logical operations at all. We support them on integer types.
778: */
780: /* A template without a definition makes any instantiation that does not use the given specializations a compile-time error,
781: which is what we want since we only support 32-bit and 64-bit integers.
782: */
783: template <typename Type, class Op, int size /* sizeof(Type) */>
784: struct AtomicLogical;
786: template <typename Type, class Op>
787: struct AtomicLogical<Type, Op, 4> {
788: __device__ Type operator()(Type &x, Type y) const
789: {
790: int *address_as_int = (int *)(&x);
791: int old = *address_as_int, assumed;
792: Op op;
793: do {
794: assumed = old;
795: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
796: } while (assumed != old);
797: return (Type)old;
798: }
799: };
801: template <typename Type, class Op>
802: struct AtomicLogical<Type, Op, 8> {
803: __device__ Type operator()(Type &x, Type y) const
804: {
805: ullint *address_as_ull = (ullint *)(&x);
806: ullint old = *address_as_ull, assumed;
807: Op op;
808: do {
809: assumed = old;
810: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
811: } while (assumed != old);
812: return (Type)old;
813: }
814: };
816: /* Note: land/lor/lxor below differ from the Atomic* functors elsewhere in this file: they take arguments by value and return the result of the operation (not the old value) */
817: template <typename Type>
818: struct land {
819: __device__ Type operator()(Type x, Type y) { return x && y; }
820: };
821: template <typename Type>
822: struct lor {
823: __device__ Type operator()(Type x, Type y) { return x || y; }
824: };
825: template <typename Type>
826: struct lxor {
827: __device__ Type operator()(Type x, Type y) { return !x != !y; }
828: };
830: template <typename Type>
831: struct AtomicLAND {
832: __device__ Type operator()(Type &x, Type y) const
833: {
834: AtomicLogical<Type, land<Type>, sizeof(Type)> op;
835: return op(x, y);
836: }
837: };
838: template <typename Type>
839: struct AtomicLOR {
840: __device__ Type operator()(Type &x, Type y) const
841: {
842: AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
843: return op(x, y);
844: }
845: };
846: template <typename Type>
847: struct AtomicLXOR {
848: __device__ Type operator()(Type &x, Type y) const
849: {
850: AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
851: return op(x, y);
852: }
853: };
854: #endif