在做循环展开时,处理完能被展开因子整除的块之后,还需要处理剩余块。这里做了个对比实验,用 switch 来加速剩余块的处理。
// switch0.c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
/*
 * Benchmark: sum the first `res` elements of an array with a fall-through
 * switch, indexing forward from the start of the array (vec[k - 1]).
 *
 * Usage: switch0 <res>, where 0 <= res <= 16 is the tail length.
 * Prints the accumulated sum, then the elapsed clock() ticks, one per line.
 * Returns EXIT_FAILURE when the argument is missing or out of range.
 */
int main(int argc, char *argv[])
{
    /* Largest tail length the switch below has a case for. To support longer
     * tails, add the matching cases and raise this constant. */
    enum { MAX_TAIL = 16 };

    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "switch0";
    if (argc < 2) {
        fprintf(stderr, "usage: %s <tail length 0..%d>\n", prog, MAX_TAIL);
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi so that non-numeric input is detected; an
     * overflowing value saturates and is caught by the range check. */
    char *parse_end = NULL;
    long parsed = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' ||
        parsed < 0 || parsed > MAX_TAIL) {
        fprintf(stderr, "%s: tail length must be an integer in 0..%d\n",
                prog, MAX_TAIL);
        return EXIT_FAILURE;
    }
    int res = (int)parsed;

    /* Fixed-size buffer: avoids a variable-length array sized by user input.
     * Only the first `res` slots are ever read. */
    int vec[MAX_TAIL] = {0};
    for (int i = 0; i < res; i++) {
        vec[i] = rand();
    }

    uint32_t accum = 0;
    clock_t start = clock();
    for (uint32_t i = UINT32_MAX; i; i--) {
        /* Entering at `case res` and falling through to `case 1` adds
         * vec[res - 1] down to vec[0] without a loop counter. */
        switch (res) {
        case 16: accum += vec[16 - 1]; /* fallthrough */
        case 15: accum += vec[15 - 1]; /* fallthrough */
        case 14: accum += vec[14 - 1]; /* fallthrough */
        case 13: accum += vec[13 - 1]; /* fallthrough */
        case 12: accum += vec[12 - 1]; /* fallthrough */
        case 11: accum += vec[11 - 1]; /* fallthrough */
        case 10: accum += vec[10 - 1]; /* fallthrough */
        case 9:  accum += vec[9 - 1];  /* fallthrough */
        case 8:  accum += vec[8 - 1];  /* fallthrough */
        case 7:  accum += vec[7 - 1];  /* fallthrough */
        case 6:  accum += vec[6 - 1];  /* fallthrough */
        case 5:  accum += vec[5 - 1];  /* fallthrough */
        case 4:  accum += vec[4 - 1];  /* fallthrough */
        case 3:  accum += vec[3 - 1];  /* fallthrough */
        case 2:  accum += vec[2 - 1];  /* fallthrough */
        case 1:  accum += vec[1 - 1];
            break;
        default: /* res == 0: nothing to add */
            break;
        }
    }
    clock_t end = clock();

    /* Casts make the arguments match the conversion specifiers exactly. */
    printf("%lu\n", (unsigned long)accum);
    printf("%lu\n", (unsigned long)(end - start));
    return 0;
}
// switch1.c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
/*
 * Benchmark: sum the first `res` elements of an array with a fall-through
 * switch, indexing backward from one past the last element (vec_end[-k]).
 *
 * Usage: switch1 <res>, where 0 <= res <= 16 is the tail length.
 * Prints the accumulated sum, then the elapsed clock() ticks, one per line.
 * Returns EXIT_FAILURE when the argument is missing or out of range.
 */
int main(int argc, char *argv[])
{
    /* Largest tail length the switch below has a case for. To support longer
     * tails, add the matching cases and raise this constant. */
    enum { MAX_TAIL = 16 };

    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "switch1";
    if (argc < 2) {
        fprintf(stderr, "usage: %s <tail length 0..%d>\n", prog, MAX_TAIL);
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi so that non-numeric input is detected; an
     * overflowing value saturates and is caught by the range check. */
    char *parse_end = NULL;
    long parsed = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' ||
        parsed < 0 || parsed > MAX_TAIL) {
        fprintf(stderr, "%s: tail length must be an integer in 0..%d\n",
                prog, MAX_TAIL);
        return EXIT_FAILURE;
    }
    int res = (int)parsed;

    /* Fixed-size buffer: avoids a variable-length array sized by user input.
     * Only the first `res` slots are ever read. */
    int vec[MAX_TAIL] = {0};
    for (int i = 0; i < res; i++) {
        vec[i] = rand();
    }
    /* One past the last filled element; vec_end[-k] is valid for 1 <= k <= res. */
    const int *vec_end = vec + res;

    uint32_t accum = 0;
    clock_t start = clock();
    for (uint32_t i = UINT32_MAX; i; i--) {
        /* Entering at `case res` and falling through to `case 1` adds
         * vec_end[-res] up to vec_end[-1] without a loop counter. */
        switch (res) {
        case 16: accum += vec_end[-16]; /* fallthrough */
        case 15: accum += vec_end[-15]; /* fallthrough */
        case 14: accum += vec_end[-14]; /* fallthrough */
        case 13: accum += vec_end[-13]; /* fallthrough */
        case 12: accum += vec_end[-12]; /* fallthrough */
        case 11: accum += vec_end[-11]; /* fallthrough */
        case 10: accum += vec_end[-10]; /* fallthrough */
        case 9:  accum += vec_end[-9];  /* fallthrough */
        case 8:  accum += vec_end[-8];  /* fallthrough */
        case 7:  accum += vec_end[-7];  /* fallthrough */
        case 6:  accum += vec_end[-6];  /* fallthrough */
        case 5:  accum += vec_end[-5];  /* fallthrough */
        case 4:  accum += vec_end[-4];  /* fallthrough */
        case 3:  accum += vec_end[-3];  /* fallthrough */
        case 2:  accum += vec_end[-2];  /* fallthrough */
        case 1:  accum += vec_end[-1];
            break;
        default: /* res == 0: nothing to add */
            break;
        }
    }
    clock_t end = clock();

    /* Casts make the arguments match the conversion specifiers exactly. */
    printf("%lu\n", (unsigned long)accum);
    printf("%lu\n", (unsigned long)(end - start));
    return 0;
}
filename | 剩余块大小为7 | 剩余块大小为14 |
---|---|---|
switch0.c | 9265104 | 18538175 |
switch1.c | 9250006 | 18597986 |
好像性能差不多(我原以为第二种写法会快一些)。
另外,在 datasketches-cpp/common/include/MurmurHash3.h 里看到了类似于第一种的写法。或许还可以用 accum 数组代替单个 accum 变量来进一步加速。
一般 Intel CPU 的 cache line 大小为 64 字节。