#include <stdio.h>
#include <unistd.h>

/* Side length of the square matrix that the demo zero-fills. */
enum { N = 1000 };

/*
 * Cache-locality demo: zero an N x N int matrix in one of two traversal
 * orders, selected only by the number of command-line arguments.
 *
 *   no arguments  (argc == 1): a[i][j] with j innermost -- walks each row
 *                              through consecutive addresses.
 *   any arguments (argc != 1): a[j][i] with j innermost -- walks down a
 *                              column, jumping one full row per store.
 *
 * argv is never inspected. Always returns 0.
 *
 * NOTE(review): the stored values are never read back, so an optimizing
 * build may remove the loops; the comparison is presumably meant to be
 * measured on a build without optimization flags -- confirm.
 */
int main(int argc, char **argv)
{
    /*
     * Static storage instead of an automatic array: N * N ints (about 4 MB
     * with a 4-byte int) is large enough to overflow the stack on platforms
     * whose stack limit is smaller than that.
     */
    static int a[N][N];

    (void)argv; /* only the argument count matters */

    if (1 == argc) {
        /* Row-by-row traversal: the inner loop advances the last index. */
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                a[i][j] = 0;
            }
        }
    } else {
        /* Column-by-column traversal: the inner loop advances the first index. */
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                a[j][i] = 0;
            }
        }
    }

    return 0;
}
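上面的程序中有两段循环(if 分支和 else 分支),哪一段效率更高?显然是第一段。为什么呢?因为在 C/C++ 中,二维数组是按行存储的(行主序):第一段按行访问,相邻两次访问的地址是连续的,同一缓存行里的数据可以被连续命中,充分利用了程序的局部性原理(空间局部性);第二段按列访问,相邻两次访问之间隔着一整行(1000 个 int,通常是 4000 字节),几乎每次访问都会落在不同的缓存行上,缓存未命中次数因此大幅增加。不带参数运行时执行第一段,带任意参数运行时执行第二段。下面先用 time 命令看看运行时间,再用 perf stat 统计 L1 数据缓存的未命中次数: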
[root@bogon c++]# g++ miss.c -o miss [root@bogon c++]# ./miss [root@bogon c++]# time ./miss real 0m0.009s user 0m0.009s sys 0m0.000s [root@bogon c++]# time ./miss 1 real 0m0.013s user 0m0.013s sys 0m0.000s [root@bogon c++]# time ./miss real 0m0.010s user 0m0.010s sys 0m0.000s [root@bogon c++]# time ./miss 1 real 0m0.013s user 0m0.013s sys 0m0.000s
[root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss Performance counter stats for './miss': 88,780 L1-dcache-load-misses 0.009002291 seconds time elapsed 0.009174000 seconds user 0.000000000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 1 Performance counter stats for './miss 1': 1,015,683 L1-dcache-load-misses 0.012000335 seconds time elapsed 0.006059000 seconds user 0.006059000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 1 Performance counter stats for './miss 1': 1,015,363 L1-dcache-load-misses 0.012145156 seconds time elapsed 0.006134000 seconds user 0.006134000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 0 Performance counter stats for './miss 0': 1,011,740 L1-dcache-load-misses 0.012363858 seconds time elapsed 0.012484000 seconds user 0.000000000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 0 Performance counter stats for './miss 0': 1,015,347 L1-dcache-load-misses 0.012348778 seconds time elapsed 0.006237000 seconds user 0.006237000 seconds sys