#include <stdio.h>
#include <sys/time.h>

/* Iterations per benchmark loop, and the seconds-to-microseconds factor. */
enum { ITERATIONS = 1000000, USEC_PER_SEC = 1000000 };

/*
 * Returns the time elapsed from *start to *end, in microseconds.
 *
 * The operands are widened to long long before the multiplication, so the
 * result type matches the %lld conversion used by the callers and the
 * product is not computed in a narrower time_t.
 */
static long long elapsed_us(const struct timeval *start,
                            const struct timeval *end)
{
    return (long long)(end->tv_sec - start->tv_sec) * USEC_PER_SEC
           + (long long)(end->tv_usec - start->tv_usec);
}

/*
 * Micro-benchmark: times ITERATIONS plain increments of a volatile int
 * against ITERATIONS __sync_fetch_and_add() calls on an int, and prints
 * both durations in microseconds.
 *
 * Always returns 0.
 */
int main(void)
{
    struct timeval start;
    struct timeval end;

    /*
     * volatile keeps every m++ as a real load/add/store so the optimizer
     * cannot collapse the loop. Initialized so the increments do not read
     * an indeterminate value.
     */
    volatile int m = 0;

    gettimeofday(&start, NULL);
    for (int i = 0; i < ITERATIONS; i++) {
        m++;
    }
    gettimeofday(&end, NULL);
    printf("add cost %lldus ", elapsed_us(&start, &end));

    /* Target of the atomic adds; initialized for the same reason as m. */
    int n = 0;

    gettimeofday(&start, NULL);
    for (int i = 0; i < ITERATIONS; i++) {
        __sync_fetch_and_add(&n, 1);
    }
    gettimeofday(&end, NULL);
    printf("atomic cost %lldus ", elapsed_us(&start, &end));

    return 0;
}
之所以用 volatile 修饰 m,是为了阻止编译器把 m++ 这个循环优化掉(否则在 -O2 下整个循环可能被直接消除)。
使用 -O2 编译并查看性能:
$ gcc -O2 -std=c99 -o perf atomic_perf.c
$ ./perf
add cost 2638us atomic cost 8510us
可见,如果你的变量根本不会被多线程访问,而你又对性能极其苛刻,就不要使用原子操作了:在这次测试中原子加法比普通自增慢了约 3 倍。原因在于原子操作在 x86-64 上会被编译成带 lock 前缀的指令,并且在有些平台上 “A full memory barrier is created when this function is invoked”。
可以通过下面的方法看到m++和原子操作的汇编之间的区别:
$gcc -O2 -std=c99 -g -c atomic_perf.c $objdump -Sl atomic_perf.o atomic_perf.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <main>: main(): /home/admin/jinxin/test/atomic_perf.c:5 #include <stdio.h> #include <sys/time.h> int main() { 0: 55 push %rbp /home/admin/jinxin/test/atomic_perf.c:9 volatile int m; struct timeval start; gettimeofday(&start, NULL); 1: 31 f6 xor %esi,%esi /home/admin/jinxin/test/atomic_perf.c:5 3: 53 push %rbx 4: 48 83 ec 38 sub $0x38,%rsp /home/admin/jinxin/test/atomic_perf.c:9 8: 48 8d 6c 24 10 lea 0x10(%rsp),%rbp d: 48 89 ef mov %rbp,%rdi 10: e8 00 00 00 00 callq 15 <main+0x15> 15: 31 d2 xor %edx,%edx /home/admin/jinxin/test/atomic_perf.c:11 for (int i = 0; i < 1000000; i++) { m++; 17: 8b 44 24 2c mov 0x2c(%rsp),%eax /home/admin/jinxin/test/atomic_perf.c:10 1b: 83 c2 01 add $0x1,%edx /home/admin/jinxin/test/atomic_perf.c:11 1e: 83 c0 01 add $0x1,%eax /home/admin/jinxin/test/atomic_perf.c:10 21: 81 fa 40 42 0f 00 cmp $0xf4240,%edx /home/admin/jinxin/test/atomic_perf.c:11 27: 89 44 24 2c mov %eax,0x2c(%rsp) /home/admin/jinxin/test/atomic_perf.c:10 2b: 75 ea jne 17 <main+0x17> /home/admin/jinxin/test/atomic_perf.c:14 } struct timeval end; gettimeofday(&end, NULL); 2d: 31 f6 xor %esi,%esi 2f: 48 89 e7 mov %rsp,%rdi 32: e8 00 00 00 00 callq 37 <main+0x37> /home/admin/jinxin/test/atomic_perf.c:16 printf("add cost %lldus ", (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)); 37: 48 8b 04 24 mov (%rsp),%rax 3b: 48 2b 44 24 10 sub 0x10(%rsp),%rax 40: bf 00 00 00 00 mov $0x0,%edi 45: 48 8b 74 24 08 mov 0x8(%rsp),%rsi 4a: 48 2b 74 24 18 sub 0x18(%rsp),%rsi 4f: 48 69 c0 40 42 0f 00 imul $0xf4240,%rax,%rax 56: 48 01 c6 add %rax,%rsi 59: 31 c0 xor %eax,%eax 5b: e8 00 00 00 00 callq 60 <main+0x60> /home/admin/jinxin/test/atomic_perf.c:19 int n; gettimeofday(&start, NULL); 60: 31 f6 xor %esi,%esi 62: 48 89 ef mov %rbp,%rdi 65: e8 00 00 00 00 callq 6a <main+0x6a> 6a: 48 8d 54 24 28 lea 0x28(%rsp),%rdx 6f: 31 c0 
xor %eax,%eax /home/admin/jinxin/test/atomic_perf.c:21 for (int i = 0; i < 1000000; i++) { __sync_fetch_and_add(&n, 1); 71: f0 83 02 01 lock addl $0x1,(%rdx) /home/admin/jinxin/test/atomic_perf.c:20 75: 83 c0 01 add $0x1,%eax 78: 3d 40 42 0f 00 cmp $0xf4240,%eax 7d: 75 f2 jne 71 <main+0x71> /home/admin/jinxin/test/atomic_perf.c:23 } gettimeofday(&end, NULL); 7f: 48 89 e7 mov %rsp,%rdi 82: 31 f6 xor %esi,%esi 84: e8 00 00 00 00 callq 89 <main+0x89> /home/admin/jinxin/test/atomic_perf.c:24 printf("atomic cost %lldus ", (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)); 89: 48 8b 04 24 mov (%rsp),%rax 8d: 48 2b 44 24 10 sub 0x10(%rsp),%rax 92: bf 00 00 00 00 mov $0x0,%edi 97: 48 8b 74 24 08 mov 0x8(%rsp),%rsi 9c: 48 2b 74 24 18 sub 0x18(%rsp),%rsi a1: 48 69 c0 40 42 0f 00 imul $0xf4240,%rax,%rax a8: 48 01 c6 add %rax,%rsi ab: 31 c0 xor %eax,%eax ad: e8 00 00 00 00 callq b2 <main+0xb2> /home/admin/jinxin/test/atomic_perf.c:27 return 0; } b2: 48 83 c4 38 add $0x38,%rsp b6: 31 c0 xor %eax,%eax b8: 5b pop %rbx b9: 5d pop %rbp ba: c3 retq