原子操作。并且在静态代码和运行时编译两种条件下使用。
▶ 源代码:静态使用
1 #ifndef _SIMPLEATOMICS_KERNEL_H_ 2 #define _SIMPLEATOMICS_KERNEL_H_ 3 //#include "device_launch_parameters.h" 4 5 __global__ void testKernel(int *g_odata) 6 { 7 const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; 8 9 // 算术运算原子指令 10 atomicAdd(&g_odata[0], 10); // 0号位加 10 11 12 atomicSub(&g_odata[1], 10); // 1号位减 10 13 14 atomicExch(&g_odata[2], tid); // 2号位与 tid 号值交换(获得最后一个访问的 tid 号) 15 16 atomicMax(&g_odata[3], tid); // 3号位获得最大的 tid 号 17 18 atomicMin(&g_odata[4], tid); // 4号位获得最小的 tid 号 19 20 atomicInc((unsigned int *)&g_odata[5], 16); // 5号位做模 17 的加法(g_odata[5] == 15 时加 1 得 16,再加 1 得 0) 21 22 atomicDec((unsigned int *)&g_odata[6], 136);// 6号位做模 137 的减法(g_odata[5] == 0 时减 1 得 136,再减 1 得 135) 23 24 atomicCAS(&g_odata[7], tid - 1, tid); // 7号位迭代 (g_odata[7] == tid - 1) ? tid : (g_odata[7]); 25 // 即以 g_odata[7] 初值为起点,增量为 1 的子序列的最大长度(一旦有增量为 1 的元素插到前面去该值就再也不变) 26 // 位运算原子指令 27 atomicAnd(&g_odata[8], 2*tid+7); // 8号位为 1,注意 (2k+7)%2 == 1 但 (2k+7)%(2^m) == 0 或 1,即仅最后一位能保证为 1 28 29 atomicOr(&g_odata[9], 1 << tid); // 9号位为 -1,所有的位均为 1 30 31 atomicXor(&g_odata[10], tid); // 10号位为 255,注意异或运算具有交换律和结合律,硬算 32 } 33 34 #endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
1 /*simpleAtomicIntrinsics_cpu.cpp*/ 2 #include <stdio.h> 3 4 extern "C" int computeGold(int *gpuData, const int len); 5 6 int computeGold(int *gpuData, const int len) 7 { 8 if (gpuData[0] != 10 * len) 9 { 10 printf("atomicAdd failed "); 11 return false; 12 } 13 if (gpuData[1] != -10 * len) 14 { 15 printf("atomicSub failed "); 16 return false; 17 } 18 if (gpuData[2] < 0 || gpuData[2] >= len)// gpuData[2] ∈ [0, len) 19 { 20 printf("atomicExch failed "); 21 return false; 22 } 23 if (gpuData[3] != len - 1) 24 { 25 printf("atomicMax failed "); 26 return false; 27 } 28 if (gpuData[4]!=0) 29 { 30 printf("atomicMin failed "); 31 return false; 32 } 33 if (gpuData[5] != len % 17) 34 { 35 printf("atomicInc failed "); 36 return false; 37 } 38 if (gpuData[6] != 137 - len % 137) 39 { 40 printf("atomicDec failed "); 41 return false; 42 } 43 if (gpuData[7] < 0 || gpuData[7] >= len)// gpuData[7] ∈ [0, len) 44 { 45 printf("atomicCAS failed "); 46 return false; 47 } 48 if (gpuData[8] != 1) 49 { 50 printf("atomicAnd failed "); 51 return false; 52 } 53 if (gpuData[9] != -1) 54 { 55 printf("atomicOr failed "); 56 return false; 57 } 58 if (gpuData[10] != 0xff) 59 { 60 printf("atomicXor failed "); 61 return false; 62 } 63 return true; 64 }
1 #include <stdio.h> 2 #include <windows.h> 3 #include <cuda_runtime.h> 4 #include <helper_functions.h> 5 #include <helper_cuda.h> 6 #include "simpleAtomicIntrinsics_kernel.cuh" 7 8 #define WINDOWS_LEAN_AND_MEAN 9 #define NOMINMAX 10 11 extern "C" bool computeGold(int *gpuData, const int len); 12 13 bool runTest() 14 { 15 bool testResult = false; 16 unsigned int numThreads = 256; 17 unsigned int numBlocks = 64; 18 unsigned int numData = 11; 19 unsigned int memSize = sizeof(int) * numData; 20 21 int *h_data = (int *) malloc(memSize); 22 for (unsigned int i = 0; i < numData; h_data[i] = 0, i++); // 初始化为全零 23 h_data[8] = h_data[10] = 0xff; // 搞点非零值 24 25 int *d_data; 26 cudaMalloc((void **) &d_data, memSize); 27 cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice); 28 29 // 输出运算前的结果 30 printf(" Before:"); 31 for (int i = 0; i < numData; i++) 32 printf("%8d,", h_data[i]); 33 printf(" "); 34 35 // 计算和计时 36 StopWatchInterface *timer; 37 sdkCreateTimer(&timer); 38 sdkStartTimer(&timer); 39 40 testKernel << <numBlocks, numThreads >> > (d_data); 41 getLastCudaError("Kernel execution failed"); 42 43 sdkStopTimer(&timer); 44 printf(" Processing time: %f ms ", sdkGetTimerValue(&timer)); 45 sdkDeleteTimer(&timer); 46 47 cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost); 48 49 // 输出运算后的结果 50 printf(" After :"); 51 for (int i = 0; i < numData; i++) 52 printf("%8d,", h_data[i]); 53 printf(" "); 54 55 testResult = computeGold(h_data, numThreads * numBlocks); 56 57 free(h_data); 58 cudaFree(d_data); 59 60 return testResult; 61 } 62 63 int main() 64 { 65 bool testResult; 66 67 printf(" Started! "); 68 69 testResult = runTest(); 70 71 printf(" Completed! main function returned %s ", testResult ? "OK!" : "ERROR!"); 72 getchar(); 73 74 return 0; 75 }
▶ 源代码:即时编译
1 /*simpleAtomicIntrinsics_kernel.cuh 发生变化的地方*/ 2 extern "C" __global__ void testKernel(int *g_odata)
1 /*simpleAtomicIntrinsics_cpu.cpp 完全一样*/
1 /*simpleAtomicIntrinsics.cpp*/ 2 #include <stdio.h> 3 #include <windows.h> 4 #include <cuda_runtime.h> 5 #include <nvrtc_helper.h> 6 #include <helper_functions.h>// includes cuda.h and cuda_runtime_api.h 7 8 #define WINDOWS_LEAN_AND_MEAN 9 #define NOMINMAX 10 11 extern "C" bool computeGold(int *gpuData, const int len); 12 13 bool runTest() 14 { 15 bool testResult = false; 16 unsigned int numThreads = 256; 17 unsigned int numBlocks = 64; 18 unsigned int numData = 11; 19 unsigned int memSize = sizeof(int) * numData; 20 21 //即时编译过程 22 char *kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", NULL); 23 char *ptx; 24 size_t ptxSize; 25 compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); 26 CUmodule module = loadPTX(ptx, 1, NULL); 27 CUfunction kernel_addr; 28 cuModuleGetFunction(&kernel_addr, module, "testKernel"); 29 30 int *h_data = (int *) malloc(memSize); 31 for (unsigned int i = 0; i < numData; h_data[i] = 0, i++); 32 h_data[8] = h_data[10] = 0xff; 33 34 CUdeviceptr d_data; 35 cuMemAlloc(&d_data, memSize); 36 cuMemcpyHtoD(d_data, h_data, memSize); 37 38 dim3 cudaBlockSize(numThreads,1,1); 39 dim3 cudaGridSize(numBlocks, 1, 1); 40 void *arr[] = { (void *)&d_data }; 41 cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, 42 cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0); 43 44 cuCtxSynchronize(); 45 46 cuMemcpyDtoH(h_data, d_data, memSize); 47 48 testResult = computeGold(h_data, numThreads * numBlocks); 49 50 free(h_data); 51 cuMemFree(d_data); 52 53 return testResult; 54 } 55 56 int main() 57 { 58 bool testResult; 59 60 printf(" Started! "); 61 62 testResult = runTest(); 63 64 printf(" Completed! main function returned %s ", testResult ? "OK!" : "ERROR!"); 65 getchar(); 66 67 return 0; 68 }
▶ 输出结果:
Started! Before: 0, 0, 0, 0, 0, 0, 0, 0, 255, 0, 255, Processing time: 0.035352 ms After : 163840, -163840, 16287, 16383, 0, 13, 56, 7, 1, -1, 255, Completed! main function returned OK!
▶ 涨姿势
● 一个有趣的数列:命 x0 = 0,xn = xn-1 XOR n,则有 x4n == 4n,x4n+1 = 1, x4n+2 == 4n+3, x4n+3 == 0。当改变初值的时候该表达式发生变化,结果如下图。三种颜色分别代表初始选作右边三个值的时候的结果。
● 解毒 device_atomic_functions.h 与原子操作。只保留了有效部分,去掉了注释和留白。
1 #if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__) 2 #define __DEVICE_ATOMIC_FUNCTIONS_HPP__ 3 4 #if defined(__CUDACC_RTC__) // 主机编译 5 #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __host__ __device__ 6 #else // 设备编译 7 #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ 8 #endif 9 10 #if defined(__cplusplus) && defined(__CUDACC__) 11 12 #include "builtin_types.h" 13 #include "host_defines.h" 14 15 // 整数原子加法。返回 *address 旧值,*address += val; 16 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) 17 { 18 return __iAtomicAdd(address, val); 19 } 20 21 // 无符号整数原子加法 22 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) 23 { 24 return __uAtomicAdd(address, val); 25 } 26 27 // 整数原子减法,注意转换为加法来运算。返回 *address 旧值,*address -= val;。 28 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) 29 { 30 return __iAtomicAdd(address, (unsigned int)-(int)val); 31 } 32 33 // 无符号整数原子减法 34 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) 35 { 36 return __uAtomicAdd(address, (unsigned int)-(int)val); 37 } 38 39 // 整数原子替换。返回 *address 旧值,*address = val; 40 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) 41 { 42 return __iAtomicExch(address, val); 43 } 44 45 // 无符号整数原子替换 46 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) 47 { 48 return __uAtomicExch(address, val); 49 } 50 51 // 浮点原子替换 52 __DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) 53 { 54 return __fAtomicExch(address, val); 55 } 56 57 // 整数原子取小。返回 *address 旧值,*address = MIN(*adress, val); 58 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) 59 { 60 return __iAtomicMin(address, val); 61 } 62 63 // 无符号整数原子取小 64 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) 65 { 66 return __uAtomicMin(address, val); 67 } 68 69 // 整数原子取大。返回 *address 旧值,*address = MAX(*adress, val); 70 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) 71 { 72 return __iAtomicMax(address, val); 73 } 74 75 // 无符号整数原子取大 76 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) 77 { 78 return __uAtomicMax(address, val); 79 } 80 81 // 无符号整数原子模加法。返回 *address 旧值,*address = (*adress + 1) % (val + 1); 82 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) 83 { 84 return __uAtomicInc(address, val); 85 } 86 87 // 无符号整数原子模减法。返回 *address 旧值,*address = (*adress + val) % (val + 1); 88 // 不用 (*adress - 1) 是为了把结果控制在 [0, val] 中,防止变成负数,这与 C 中的 % 运算不同 89 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) 90 { 91 return __uAtomicDec(address, val); 92 } 93 94 // 整数原子按位且。返回 *address 旧值,*adress &= val; 95 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) 96 { 97 return __iAtomicAnd(address, val); 98 } 99 100 // 无符号整数原子按位且 101 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) 102 { 103 return __uAtomicAnd(address, val); 104 } 105 106 // 整数原子按位或。返回 *address 旧值,*adress |= val; 107 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) 108 { 109 return __iAtomicOr(address, val); 110 } 111 112 // 无符号整数原子按位或 113 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) 114 { 115 return __uAtomicOr(address, val); 116 } 117 118 // 整数原子按位异或。返回 *address 旧值,*adress ^= val; 119 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) 120 { 121 return __iAtomicXor(address, val); 122 } 123 124 // 无符号整数原子按位异或 125 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) 126 { 127 return __uAtomicXor(address, val); 128 } 129 130 // 整数原子比较赋值。返回 *address 旧值,*address = (*address == compare) ? val : *address; 131 __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) 132 { 133 return __iAtomicCAS(address, compare, val); 134 } 135 136 // 无符号整数原子比较赋值 137 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) 138 { 139 return __uAtomicCAS(address, compare, val); 140 } 141 142 // 无符号长整数原子加法 143 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) 144 { 145 return __ullAtomicAdd(address, val); 146 } 147 148 // 无符号长整数原子替换 149 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) 150 { 151 return __ullAtomicExch(address, val); 152 } 153 154 // 无符号长整数原子比较赋值 155 __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) 156 { 157 return __ullAtomicCAS(address, compare, val); 158 } 159 160 // 原子存在量词 161 __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond) 162 { 163 return (bool)__any((int)cond); 164 } 165 166 // 原子全称量词 167 __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond) 168 { 169 return (bool)__all((int)cond); 170 } 171 172 #endif /* __cplusplus && __CUDACC__ */ 173 174 #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ 175 176 #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
● 原子操作函数声明在 device_functions.h 中。
当设备计算能力 > 320 或 > 600 时开放各原子操作对应的 block 和 system 函数,并开放对应 long long、float、double 型数据的同一个函数,例如:
1 #if !defined(__CUDACC_RTC__) || __CUDA_ARCH__ >= 600 2 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); 3 4 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); 5 6 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); 7 8 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); 9 10 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val); 11 #endif /* !__CUDACC_RTC__ || __CUDA_ARCH__ >= 600 */
计算能力 600 以上能用的所有原子操作:
1 #define __DEVICE_FUNCTIONS_STATIC_DECL__ __host__ __device__ __cudart_builtin__ 2 3 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd(int *p, int val); 4 5 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_block(int *p, int val); 6 7 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAdd_system(int *p, int val); 8 9 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd(unsigned int *p, unsigned int val); 10 11 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_block(unsigned int *p, unsigned int val); 12 13 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAdd_system(unsigned int *p, unsigned int val); 14 15 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd(unsigned long long *p, unsigned long long val); 16 17 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_block(unsigned long long *p, unsigned long long val); 18 19 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAdd_system(unsigned long long *p, unsigned long long val); 20 21 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd(float *p, float val); 22 23 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_block(float *p, float val); 24 25 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicAdd_system(float *p, float val); 26 27 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd(double *p, double val); 28 29 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_block(double *p, double val); 30 31 __DEVICE_FUNCTIONS_STATIC_DECL__ double __dAtomicAdd_system(double *p, double val); 32 33 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch(int *p, int val); 34 35 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_block(int *p, int val); 36 37 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicExch_system(int *p, int val); 38 39 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch(unsigned int *p, unsigned int val); 40 41 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_block(unsigned int *p, unsigned int val); 42 43 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicExch_system(unsigned int *p, unsigned int val); 44 45 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch(unsigned long long *p, unsigned long long val); 46 47 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_block(unsigned long long *p, unsigned long long val); 48 49 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicExch_system(unsigned long long *p, unsigned long long val); 50 51 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch(float *p, float val); 52 53 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_block(float *p, float val); 54 55 __DEVICE_FUNCTIONS_STATIC_DECL__ float __fAtomicExch_system(float *p, float val); 56 57 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin(int *p, int val); 58 59 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_block(int *p, int val); 60 61 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMin_system(int *p, int val); 62 63 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin(long long *p, long long val); 64 65 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_block(long long *p, long long val); 66 67 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMin_system(long long *p, long long val); 68 69 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin(unsigned int *p, unsigned int val); 70 71 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_block(unsigned int *p, unsigned int val); 72 73 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMin_system(unsigned int *p, unsigned int val); 74 75 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin(unsigned long long *p, unsigned long long val); 76 77 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_block(unsigned long long *p, unsigned long long val); 78 79 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMin_system(unsigned long long *p, unsigned long long val); 80 81 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax(int *p, int val); 82 83 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_block(int *p, int val); 84 85 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicMax_system(int *p, int val); 86 87 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax(long long *p, long long val); 88 89 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_block(long long *p, long long val); 90 91 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __illAtomicMax_system(long long *p, long long val); 92 93 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax(unsigned int *p, unsigned int val); 94 95 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_block(unsigned int *p, unsigned int val); 96 97 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicMax_system(unsigned int *p, unsigned int val); 98 99 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax(unsigned long long *p, unsigned long long val); 100 101 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_block(unsigned long long *p, unsigned long long val); 102 103 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicMax_system(unsigned long long *p, unsigned long long val); 104 105 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc(unsigned int *p, unsigned int val); 106 107 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_block(unsigned int *p, unsigned int val); 108 109 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicInc_system(unsigned int *p, unsigned int val); 110 111 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec(unsigned int *p, unsigned int val); 112 113 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_block(unsigned int *p, unsigned int val); 114 115 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicDec_system(unsigned int *p, unsigned int val); 116 117 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS(int *p, int compare, int val); 118 119 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_block(int *p, int compare, int val); 120 121 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicCAS_system(int *p, int compare, int val); 122 123 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS(unsigned int *p, unsigned int compare, unsigned int val); 124 125 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_block(unsigned int *p, unsigned int compare, unsigned int val); 126 127 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicCAS_system(unsigned int *p, unsigned int compare unsigned int val); 128 129 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS(unsigned long long int *p unsigned long long int compare unsigned long long int val); 130 131 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_block(unsigned long long int *p unsigned long long int compare unsigned long long int val); 132 133 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicCAS_system(unsigned long long int *p unsigned long long int compare unsigned long long int val); 134 135 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd(int *p, int val); 136 137 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_block(int *p, int val); 138 139 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicAnd_system(int *p, int val); 140 141 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicAnd(long long int *p, long long int val); 142 143 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_block(long long *p, long long val); 144 145 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicAnd_system(long long *p, long long val); 146 147 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd(unsigned int *p, unsigned int val); 148 149 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_block(unsigned int *p, unsigned int val); 150 151 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicAnd_system(unsigned int *p, unsigned int val); 152 153 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicAnd(unsigned long long int *p unsigned long long int val); 154 155 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_block(unsigned long long *p, unsigned long long val); 156 157 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicAnd_system(unsigned long long *p, unsigned long long val); 158 159 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr(int *p, int val); 160 161 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_block(int *p, int val); 162 163 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicOr_system(int *p, int val); 164 165 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicOr(long long int *p, long long int val); 166 167 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_block(long long *p, long long val); 168 169 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicOr_system(long long *p, long long val); 170 171 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr(unsigned int *p, unsigned int val); 172 173 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_block(unsigned int *p, unsigned int val); 174 175 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicOr_system(unsigned int *p, unsigned int val); 176 177 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicOr(unsigned long long int *p unsigned long long int val); 178 179 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_block(unsigned long long *p, unsigned long long val); 180 181 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicOr_system(unsigned long long *p, unsigned long long val); 182 183 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor(int *p, int val); 184 185 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_block(int *p, int val); 186 187 __DEVICE_FUNCTIONS_STATIC_DECL__ int __iAtomicXor_system(int *p, int val); 188 189 __DEVICE_FUNCTIONS_STATIC_DECL__ long long int __llAtomicXor(long long int *p, long long int val); 190 191 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_block(long long *p, long long val); 192 193 __DEVICE_FUNCTIONS_STATIC_DECL__ long long __llAtomicXor_system(long long *p, long long val); 194 195 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor(unsigned int *p, unsigned int val); 196 197 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_block(unsigned int *p, unsigned int val); 198 199 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned int __uAtomicXor_system(unsigned int *p, unsigned int val); 200 201 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long int __ullAtomicXor(unsigned long long int *p unsigned long long int val); 202 203 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_block(unsigned long long *p, unsigned long long val); 204 205 __DEVICE_FUNCTIONS_STATIC_DECL__ unsigned long long __ullAtomicXor_system(unsigned long long *p, unsigned long long val);