• 剩余块用switch处理


    在做循环展开时,处理完能整除的块之后,还需要处理剩余块。做了个实验对比,用switch加速剩余块的处理

    // switch0.c
    
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <time.h>
    
    /*
     * switch0: benchmark for handling the remainder block of an unrolled loop,
     * indexing the array from its start (vec[k - 1]).
     *
     * argv[1] is the remainder size. It must be in 1..16 because the switch
     * below only has case labels for those values, and because a variable
     * length array must have a size greater than zero.
     *
     * The fall-through switch is executed UINT32_MAX times. The program then
     * prints the accumulated sum and the elapsed clock() ticks, one per line.
     * Returns EXIT_FAILURE on a missing or invalid argument, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
    	enum { MAX_RES = 16 }; /* largest case label handled by the switch */

    	if (argc < 2)
    	{
    		fprintf(stderr, "usage: switch0 <remainder size 1..%d>\n", MAX_RES);
    		return EXIT_FAILURE;
    	}

    	/* strtol instead of atoi so that garbage input is detected. An
    	 * out-of-range conversion saturates and is caught by the range check. */
    	char *parse_end;
    	long parsed = strtol(argv[1], &parse_end, 10);
    	if (parse_end == argv[1] || *parse_end != '\0' || parsed < 1 || parsed > MAX_RES)
    	{
    		fprintf(stderr, "remainder size must be an integer in 1..%d\n", MAX_RES);
    		return EXIT_FAILURE;
    	}

    	int res = (int)parsed;
    	uint32_t accum = 0;
    	int vec[res];
    	for (int i = 0; i < res; i++)
    	{
    		vec[i] = rand();
    	}

    	clock_t start = clock();

    	for (uint32_t i = UINT32_MAX; i; i--)
    	{
    		/* Every case intentionally falls through to the next one, so
    		 * entering at `case res` adds vec[res - 1] down to vec[0]. */
    		switch (res)
    		{
    		case 16:
    			accum += vec[16 - 1];
    			/* fallthrough */
    		case 15:
    			accum += vec[15 - 1];
    			/* fallthrough */
    		case 14:
    			accum += vec[14 - 1];
    			/* fallthrough */
    		case 13:
    			accum += vec[13 - 1];
    			/* fallthrough */
    		case 12:
    			accum += vec[12 - 1];
    			/* fallthrough */
    		case 11:
    			accum += vec[11 - 1];
    			/* fallthrough */
    		case 10:
    			accum += vec[10 - 1];
    			/* fallthrough */
    		case 9:
    			accum += vec[9 - 1];
    			/* fallthrough */
    		case 8:
    			accum += vec[8 - 1];
    			/* fallthrough */
    		case 7:
    			accum += vec[7 - 1];
    			/* fallthrough */
    		case 6:
    			accum += vec[6 - 1];
    			/* fallthrough */
    		case 5:
    			accum += vec[5 - 1];
    			/* fallthrough */
    		case 4:
    			accum += vec[4 - 1];
    			/* fallthrough */
    		case 3:
    			accum += vec[3 - 1];
    			/* fallthrough */
    		case 2:
    			accum += vec[2 - 1];
    			/* fallthrough */
    		case 1:
    			accum += vec[1 - 1];
    			/* fallthrough */
    		default:
    			break;
    		}
    	}

    	clock_t end = clock();
    	printf("%u\n", accum);
    	/* clock_t has an implementation-defined type; cast to match %lu. */
    	printf("%lu\n", (unsigned long)(end - start));
    	return 0;
    }
    
    
    // switch1.c
    
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <time.h>
    
    /*
     * switch1: benchmark for handling the remainder block of an unrolled loop,
     * indexing the array backwards from its one-past-the-end pointer
     * (vec_end[-k]).
     *
     * argv[1] is the remainder size. It must be in 1..16 because the switch
     * below only has case labels for those values, and because a variable
     * length array must have a size greater than zero.
     *
     * The fall-through switch is executed UINT32_MAX times. The program then
     * prints the accumulated sum and the elapsed clock() ticks, one per line.
     * Returns EXIT_FAILURE on a missing or invalid argument, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
    	enum { MAX_RES = 16 }; /* largest case label handled by the switch */

    	if (argc < 2)
    	{
    		fprintf(stderr, "usage: switch1 <remainder size 1..%d>\n", MAX_RES);
    		return EXIT_FAILURE;
    	}

    	/* strtol instead of atoi so that garbage input is detected. An
    	 * out-of-range conversion saturates and is caught by the range check. */
    	char *parse_end;
    	long parsed = strtol(argv[1], &parse_end, 10);
    	if (parse_end == argv[1] || *parse_end != '\0' || parsed < 1 || parsed > MAX_RES)
    	{
    		fprintf(stderr, "remainder size must be an integer in 1..%d\n", MAX_RES);
    		return EXIT_FAILURE;
    	}

    	int res = (int)parsed;
    	uint32_t accum = 0;
    	int vec[res];
    	int *vec_end = vec + res; /* one past the last element */
    	for (int i = 0; i < res; i++)
    	{
    		vec[i] = rand();
    	}

    	clock_t start = clock();

    	for (uint32_t i = UINT32_MAX; i; i--)
    	{
    		/* Every case intentionally falls through to the next one, so
    		 * entering at `case res` adds vec_end[-res] up to vec_end[-1]. */
    		switch (res)
    		{
    		case 16:
    			accum += vec_end[-16];
    			/* fallthrough */
    		case 15:
    			accum += vec_end[-15];
    			/* fallthrough */
    		case 14:
    			accum += vec_end[-14];
    			/* fallthrough */
    		case 13:
    			accum += vec_end[-13];
    			/* fallthrough */
    		case 12:
    			accum += vec_end[-12];
    			/* fallthrough */
    		case 11:
    			accum += vec_end[-11];
    			/* fallthrough */
    		case 10:
    			accum += vec_end[-10];
    			/* fallthrough */
    		case 9:
    			accum += vec_end[-9];
    			/* fallthrough */
    		case 8:
    			accum += vec_end[-8];
    			/* fallthrough */
    		case 7:
    			accum += vec_end[-7];
    			/* fallthrough */
    		case 6:
    			accum += vec_end[-6];
    			/* fallthrough */
    		case 5:
    			accum += vec_end[-5];
    			/* fallthrough */
    		case 4:
    			accum += vec_end[-4];
    			/* fallthrough */
    		case 3:
    			accum += vec_end[-3];
    			/* fallthrough */
    		case 2:
    			accum += vec_end[-2];
    			/* fallthrough */
    		case 1:
    			accum += vec_end[-1];
    			/* fallthrough */
    		default:
    			break;
    		}
    	}

    	clock_t end = clock();
    	printf("%u\n", accum);
    	/* clock_t has an implementation-defined type; cast to match %lu. */
    	printf("%lu\n", (unsigned long)(end - start));
    	return 0;
    }
    
    
    filename 剩余块大小为7 剩余块大小为14
    switch0.c 9265104 18538175
    switch1.c 9250006 18597986

    好像性能差不多(我原以为第二种写法会快一些)。

    另外在datasketches-cpp/common/include/MurmurHash3.h里看到了类似于第一种的写法。另外或许可以用accum数组代替accum变量来进一步加速

    一般Intel CPU的cache line大小为64字节

  • 相关阅读:
    Permutation Test 置换检验
    计算机会议排名等级
    国际顶级计算机会议
    机器学习中的范数规则化 L0、L1与L2范数 核范数与规则项参数选择
    岭回归(Ridge Regression)
    popupWindow使用timePicker时点击出现闪屏问题的解决办法
    Java:单例模式的七种写法
    JSONObject遍历获取键值方法合并两个JSONObject
    解决android studio上“com.android.dex.DexIndexOverflowException: method ID not in [0, 0xffff]: 65935”问题
    解决同时共用MOB公司的shareSDK和SMSSDK的冲突问题
  • 原文地址:https://www.cnblogs.com/Tifa-Best/p/14090297.html
Copyright © 2020-2023  润新知