Actual source code: cupmatomics.hpp
#pragma once

/*====================================================================================*/
/* Atomic operations on device */
/*====================================================================================*/
#include <petscdevice_cupm.h>
#include <petscsystypes.h>

/* In terms of function overloading, long long int is a different type than int64_t, which PetscInt may be defined as.
   We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
*/
typedef long long int          llint;
typedef unsigned long long int ullint;
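
/*
  Illustrative note (not part of the original header): even when int64_t and long long int have the
  same size, they can be distinct types for overload resolution (e.g. on typical LP64 Linux systems
  int64_t is long int). So the llint overloads of atomicExch(), atomicAdd() etc. defined below do
  not clash with the (unsigned) long long int builtins:

    __device__ void f(llint *p, llint v) { (void)atomicExch(p, v); }   // picks the llint overload below
    __device__ void g(ullint *p, ullint v) { (void)atomicExch(p, v); } // picks the builtin
*/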
#if PetscDefined(USING_NVCC)
/*
  Atomic Insert (exchange) operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.3:

  int atomicExch(int* address, int val);
  unsigned int atomicExch(unsigned int* address, unsigned int val);
  unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
  float atomicExch(float* address, float val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory and stores val back to memory at the same address. These two operations are
  performed in one atomic transaction. The function returns old.

  PETSc notes:

  It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
  be atomic by itself.

  With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee
  that the whole insertion is atomic. We hope no user code relies on that.
*/
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}

__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};
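
/*
  Usage sketch (illustrative, not part of the original header): a hypothetical kernel doing
  fetch-and-replace, the pattern PetscSFFetchAndOp would need with op = MPI_REPLACE. The names
  FetchAndInsert, rootdata, leafdata and idx are made up for this example.

    __global__ void FetchAndInsert(double *rootdata, double *leafdata, const int *idx, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      AtomicInsert<double> op;
      if (i < n) leafdata[i] = op(rootdata[idx[i]], leafdata[i]); // atomically store the new value and fetch the old one
    }
*/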
#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
/* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values into the same location */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of two ops. Caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    /* A single-precision complex occupies 64 bits, so it can be exchanged in one shot as a double */
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    return op(xp[0], yp[0]);
  }
};
#endif
#endif
/*
  Atomic add operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.1:

  int atomicAdd(int* address, int val);
  unsigned int atomicAdd(unsigned int* address, unsigned int val);
  unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val);
  float atomicAdd(float* address, float val);
  double atomicAdd(double* address, double val);
  __half2 atomicAdd(__half2 *address, __half2 val);
  __half atomicAdd(__half *address, __half val);

  reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
  and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
  function returns old.

  The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
  The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
  The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
  higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
  the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
  The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
*/
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};
template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(&x, y);
#else
    double *address = &x, val = y;
    ullint *address_as_ull = (ullint *)address;
    ullint old = *address_as_ull, assumed;
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
#endif
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return atomicAdd(&x, y);
#else
    float *address = &x, val = y;
    int *address_as_int = (int *)address;
    int old = *address_as_int, assumed;
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
#endif
  }
};
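
/*
  Usage sketch (illustrative, not part of the original header): summing all entries of an array into
  one location. AtomicAdd<double> uses the hardware atomicAdd on sm_60 and newer devices and falls
  back to the atomicCAS loop above otherwise. SumAll and result are made-up names; *result is
  assumed zeroed before the launch.

    __global__ void SumAll(const double *x, double *result, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      AtomicAdd<double> op;
      if (i < n) (void)op(*result, x[i]); // the returned old value is not needed here
    }
*/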
#if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of two ops. Caller should discard it. */
  }
};
#endif
/*
  Atomic Mult operations:

  CUDA has no atomicMult at all, so we build our own with atomicCAS
*/
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};
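
/*
  Usage sketch (illustrative, not part of the original header): a product reduction built on the
  atomicCAS-based atomicMult() above. ProductAll and result are made-up names; *result is assumed
  initialized to 1 before the launch.

    __global__ void ProductAll(const PetscReal *x, PetscReal *result, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      AtomicMult<PetscReal> op;
      if (i < n) (void)op(*result, x[i]);
    }
*/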
/*
  Atomic Min/Max operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

  int atomicMin(int* address, int val);
  unsigned int atomicMin(unsigned int* address, unsigned int val);
  unsigned long long int atomicMin(unsigned long long int* address, unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes the minimum of old and val, and stores the result back to memory
  at the same address. These three operations are performed in one atomic transaction.
  The function returns old.
  The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

  atomicMax() is similar.
*/
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif
/*
  atomicMin/Max(long long *, long long) are not in NVIDIA's documentation. But on OLCF Summit we found
  atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
  This causes compilation errors with PGI compilers and 64-bit indices:
    error: function "atomicMin(long long *, long long)" has already been defined

  So we add the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
*/
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
#endif

template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};
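
/*
  Usage sketch (illustrative, not part of the original header): computing the min and max of an
  array in one pass. MinMaxAll, mn and mx are made-up names; *mn and *mx are assumed initialized
  to PETSC_MAX_REAL and PETSC_MIN_REAL respectively before the launch.

    __global__ void MinMaxAll(const PetscReal *x, PetscReal *mn, PetscReal *mx, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      AtomicMin<PetscReal> minop;
      AtomicMax<PetscReal> maxop;
      if (i < n) {
        (void)minop(*mn, x[i]);
        (void)maxop(*mx, x[i]);
      }
    }
*/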
/*
  Atomic bitwise operations

  CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

  int atomicAnd(int* address, int val);
  unsigned int atomicAnd(unsigned int* address, unsigned int val);
  unsigned long long int atomicAnd(unsigned long long int* address, unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes (old & val), and stores the result back to memory at the same
  address. These three operations are performed in one atomic transaction.
  The function returns old.

  The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

  atomicOr() and atomicXor() are similar.
*/
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? See comments at atomicMin() above */
__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
#endif

template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};
/*
  Atomic logical operations:

  CUDA has no atomic logical operations at all. We support them on integer types.
*/

/* Leaving the primary template undefined makes any instantiation that does not match one of the
   specializations below a compile-time error, which is what we want since we only support 32-bit
   and 64-bit integers.
*/
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;
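
/*
  For example, with the specializations below, AtomicLogical<int, land<int>, 4> and
  AtomicLogical<llint, lor<llint>, 8> are both usable, while

    AtomicLogical<short, land<short>, 2> op; // compile-time error: incomplete type

  fails to compile, since no specialization with size == 2 exists.
*/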
template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int old = *address_as_int, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};
/* Note: land/lor/lxor below are different from the atomic ops above in that they take their
   arguments by value and return the result of the op, not the old value */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};
template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
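
/*
  Usage sketch (illustrative, not part of the original header): testing whether all entries of an
  integer array are nonzero. AllNonzero and flag are made-up names; *flag is assumed initialized
  to 1 before the launch.

    __global__ void AllNonzero(const PetscInt *x, PetscInt *flag, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      AtomicLAND<PetscInt> op;
      if (i < n) (void)op(*flag, x[i]);
    }
*/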
#elif PetscDefined(USING_HCC)

/*
  Atomic Insert (exchange) operations

  See the CUDA version above for comments.
*/
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}
#endif

__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};
#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of two ops. Caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    double *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    return op(xp[0], yp[0]);
  }
};
#endif
#endif
/*
  Atomic add operations
*/
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};
template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
    /* Unlike the CUDA version, no compute-capability fallback is needed: HIP provides atomicAdd(double*, double) natively */
    return atomicAdd(&x, y);
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
    /* Unlike the CUDA version, no compute-capability fallback is needed: HIP provides atomicAdd(float*, float) natively */
    return atomicAdd(&x, y);
  }
};
#if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex old, *z = &old;
    PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of two ops. Caller should discard it. */
  }
};
#endif
/*
  Atomic Mult operations:

  HIP has no atomicMult at all, so we build our own with atomicCAS
*/
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};
/*
  Atomic Min/Max operations

  See the CUDA version for comments.
*/
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif
#endif

#if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
#endif

template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};
/*
  Atomic bitwise operations
  As of ROCm 3.10, atomicAnd/Or/Xor(llint*, llint) are not supported, so we build our own with atomicCAS
*/
__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};
/*
  Atomic logical operations:

  HIP has no atomic logical operations at all. We support them on integer types.
*/

/* Leaving the primary template undefined makes any instantiation that does not match one of the
   specializations below a compile-time error, which is what we want since we only support 32-bit
   and 64-bit integers.
*/
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int old = *address_as_int, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};
/* Note: land/lor/lxor below are different from the atomic ops above in that they take their
   arguments by value and return the result of the op, not the old value */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};
template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
#endif