[lab]csappcachelab

Cache lab

该lab主要是对应第六章存储器层次结构.

分为两部分,

A: cpu cache 命中分析,
B: cache 命中优化

Part A.

首先为了实现 part A, 我们要安装 valgrind, 它是一个程序分析工具, 其 lackey 工具的 --trace-mem=yes 选项能输出指定命令的内存读写记录 (trace). 命中分析就基于这种输出: 在给定 s E b 参数下, 统计并输出 hit, miss, eviction 的次数. 下面是参考程序的一个输出例子

linux> ./csim-ref -v -s 4 -E 1 -b 4 -t traces/yi.trace
L 10,1 miss
M 20,1 miss hit
L 22,1 hit
S 18,1 hit
L 110,1 miss eviction
L 210,1 miss eviction
M 12,1 miss eviction hit
hits:4 misses:5 evictions:3

我们要实现基于LRU淘汰策略的高速缓存, 相应的地址编码, 及数据定义如下

31  b+s   b   0
| CT | CI |CO |


/*
 * Simulator-wide parameters.
 *   s       - number of set-index bits (used by CI and CT)
 *   b       - number of block-offset bits (used by CI, CO and CT)
 *   t       - number of tag bits kept by CT's mask
 *   E       - set capacity that insert_item_into_group compares against
 *   verbose - non-zero enables the per-access trace printing
 *   e       - not referenced by the code shown here
 */
int s, E, e, b, verbose, t;

/* Set index: the s bits directly above the block offset. */
#define CI(v) (((v) >> b) & ((1 << s) - 1))
/* Block offset: the low b bits. */
#define CO(v) ((v) & ((1 << b) - 1))
/*
 * Tag: the t bits above the set index.  The whole expansion is
 * parenthesized so that uses such as `CT(a) == x` or `CT(a) + 1` bind to the
 * full result instead of to the trailing mask sub-expression.
 */
#define CT(v) (((v) >> (s + b)) & ((1 << t) - 1))

首先分配 2^s 个 cache 组 (组下标为 0 到 2^s-1), 然后循环读取文件中的访问数据, 对地址 addr, 计算出它所在的组和标识符 (tag), 在组中查找是否存在, 如果存在, 则更新其访问时间, 否则插入到组中, 并输出命中或者 miss. 这里要注意修改 (M) 的情况, 实际是先将值取出, 再将修改后的值写入, 我们不需要真正管理 cache 中的值, 直接默认第二次访问命中即可.

// Select the set addressed by the index bits and extract the tag to look for.
CacheGroupPtr group = cache_groups[CI(addr)];
int mask = CT(addr);
// fprintf(stderr, "idx %d %d %d\n", CI(addr), mask, addr);
// In verbose mode, echo the access (operation, hex address, size) first.
verbose ? printf("%s %x,%d", mod, addr, size) : 0;
if (find_item_in_group(group, mask)) {
    // Hit: the tag is already in the set (the lookup also moved it to the front).
    hit++;
    verbose ? printf(" hit") : 0;
} else {
    miss++;
    verbose ? printf(" miss") : 0;
    // Miss: insert the tag; a non-zero return means a line was evicted to make room.
    if (insert_item_into_group(group, mask)) {
        eviction++;
        verbose ? printf(" eviction") : 0;
    }
}
if (mod[0] == 'M') {
    hit++;
    // A modify is a load followed by a store; the store is counted as one extra hit.
    verbose ? printf(" hit") : 0;
}
// Terminate the verbose line for this access.
verbose ? puts("") : 0;

cache 我使用链表来模拟, 其中每个节点都是一个cache line, 其中的数据包括:

/* One cache line, kept as a node of a singly linked list. */
struct CacheItem {
    struct CacheItem *next; /* following node in the list, NULL at the tail */
    int val;                /* value that lookups compare against */
};
typedef struct CacheItem CacheItem;
typedef struct CacheItem *CacheItemPtr;

/* One cache set: a linked list of lines plus a count of stored lines. */
struct CacheGroup {
    CacheItemPtr head; /* first node of the list */
    int size;          /* number of lines currently counted in the set */
};
typedef struct CacheGroup CacheGroup;
typedef struct CacheGroup *CacheGroupPtr;

链表中, 节点的存放顺序就是它们最近被访问的先后顺序 (表头是最近访问的节点, 表尾是最久未访问的节点).
插入新节点前若 CacheGroup.size == E, 则先执行淘汰, 删除最后一个节点即可,
当节点被访问或加入时, 直接移动/插入到链表头部.


CacheItemPtr init_cache_item(int v) {
    CacheItemPtr i = (CacheItemPtr)malloc(sizeof(CacheItem));
    if (i == NULL) {
        exit(1);
    }
    i->val = v;
    i->next = NULL;
    return i;
} 

/*
 * Free `item` and every node reachable from it through `next`.
 * Passing NULL is a no-op.
 */
void clear_cache_item(CacheItemPtr item) {
    while (item != NULL) {
        /* Remember the successor before the current node is released. */
        CacheItemPtr rest = item->next;
        free(item);
        item = rest;
    }
}

CacheGroupPtr init_cache_group() {
    CacheGroupPtr g = (CacheGroupPtr)malloc(sizeof(CacheGroup));
    if (g == NULL) {
        exit(1);
    }
    g->head = init_cache_item(0);
    g->size = 0;
    return g;
}

/*
 * Release a cache set: its whole node list first, then the set itself.
 * Passing NULL is a no-op.
 */
void clear_cache_group(CacheGroupPtr group) {
    if (group != NULL) {
        clear_cache_item(group->head);
        free(group);
    }
}

/*
 * Search the set for a line whose value equals `val`.
 * On a match the node is unlinked and re-inserted right after the
 * placeholder head, making it the most recently used line.
 * Returns 1 on a match, 0 otherwise.
 */
int find_item_in_group(CacheGroupPtr group, int val) {
    CacheItemPtr sentinel = group->head;
    CacheItemPtr *link;

    /* `link` always points at the `next` field that references the candidate. */
    for (link = &sentinel->next; *link != NULL; link = &(*link)->next) {
        CacheItemPtr found = *link;
        if (found->val != val) {
            continue;
        }
        *link = found->next;           /* unlink from its current position */
        found->next = sentinel->next;  /* splice in at the front */
        sentinel->next = found;
        return 1;
    }
    return 0;
}

/*
 * Remove the last node of the set's list (the least recently used line,
 * since lookups and inserts place nodes at the front) and decrement the
 * set's line count.
 *
 * If the set holds no lines (only the placeholder head), nothing is removed
 * and the count is left untouched. Without this guard the function would
 * dereference NULL, e.g. when insert_item_into_group runs with E == 0.
 */
void evict_last_group(CacheGroupPtr group) {
    CacheItemPtr pre_item = group->head;
    CacheItemPtr item = pre_item->next;

    if (item == NULL) {
        return; /* empty set: nothing to evict */
    }
    /* Walk to the tail, remembering its predecessor. */
    while (item->next != NULL) {
        pre_item = item;
        item = item->next;
    }
    pre_item->next = NULL;
    clear_cache_item(item);
    group->size--;
}

/*
 * Insert a new line holding `val` at the front of the set.
 * When the set already counts exactly E lines, the last line is evicted
 * first. Returns 1 if an eviction took place, 0 otherwise.
 */
int insert_item_into_group(CacheGroupPtr group, int val) {
    int evicted = (group->size == E);
    CacheItemPtr fresh;

    if (evicted) {
        evict_last_group(group);
    }
    fresh = init_cache_item(val);
    fresh->next = group->head->next;
    group->head->next = fresh;
    group->size += 1;
    return evicted;
}

Part B

为矩阵转置算法进行 cache 命中优化, cache 参数为 s = 5, E = 1, b = 5, 即块大小32字节, 组内只有一块, 总共32个组, 原始的转置代码如下:

/*
 * Baseline transpose: copy the N x M matrix A into the M x N matrix B
 * so that B[j][i] == A[i][j], scanning A row by row.
 */
void trans(int M, int N, int A[N][M], int B[M][N])
{
    int row = 0;

    while (row < N) {
        int col = 0;
        while (col < M) {
            int value = A[row][col];
            B[col][row] = value;
            col++;
        }
        row++;
    }
}

在解决时一开始没有头绪,走了很多弯路, 首先比较直观的观察

int 大小为 4 字节, 一个 cache line (32 字节) 可以存放 8 个 int
矩阵内存是按行存储, 因此 A[i][j] 行访问可以很好的命中 cache, 而B[j][i] 列访问需要我们进行优化.
三种情况 32:32, 64:64, 61:67 可以进行不同的优化.

因此我的第一版思路为对矩阵分成 8*8 的块, 然后按对角线方式遍历, 且函数最多有12个临时变量, 4个作为循环+分块变量, 8个可以用作访问缓存.

|10|13|15|16|
|6 |9 |12|14|
|3 |5 |8 |11|
|1 |2 |4 |7 |

但该方法对 64:64 的情况没什么效果, 这时我查阅了博客, 发现解决问题的关键就是分组+避免冲突, 跟对角线访问顺序没什么关系, 64:64情况下按原来的8个一组会造成冲突, 从而降低效率, 要改进成4个一组.

对于61:67的情况, 由于矩阵大小没有跟cache line对齐, 因此按8个一组就不会冲突. 我们先按8个一组访问, 对不满8个的边界情况直接挨个访问. 以下是我的解答代码


char transpose_64_64_desc[] = "Transpose for 64 64";
/*
 * Transpose A (N x M) into B (M x N) in 8x8 tiles, handling each tile as
 * two passes of four columns: columns 0-3 of all eight rows first, then
 * columns 4-7. Each row's four values are buffered before being written
 * into B.
 *
 * M and N are stepped in units of 8 with no remainder handling, so both
 * must be multiples of 8.
 * Author's note on the original: "1653" (presumably the measured miss count
 * for the 64x64 case).
 */
void transpose_64_64(int M, int N, int A[N][M], int B[M][N])
{
    int row0, col0, r, c, half;
    int buf[8];

    for (row0 = 0; row0 < N; row0 += 8) {
        for (col0 = 0; col0 < M; col0 += 8) {
            /* half == 0 covers tile columns 0-3, half == 4 covers 4-7. */
            for (half = 0; half < 8; half += 4) {
                for (r = 0; r < 8; ++r) {
                    for (c = half; c < half + 4; ++c) {
                        buf[c] = A[row0 + r][col0 + c];
                    }
                    for (c = half; c < half + 4; ++c) {
                        B[col0 + c][row0 + r] = buf[c];
                    }
                }
            }
        }
    }
}


char transpose_general_block8_desc[] = "Transpose for genernal, block is 8";
/*
 * Transpose A (N x M) into B (M x N) for arbitrary dimensions.
 *
 * Full 8x8 tiles are copied row by row through a small buffer. Columns
 * left over to the right of the last full tile are copied as 8-element
 * column strips, and rows left over below the last full tile are copied as
 * 8-element row strips followed by single elements.
 *
 * The tile size is a local enum constant. Previously the whole body was
 * wrapped in `#ifndef BLOCK_SIZE`, so an existing BLOCK_SIZE definition
 * would have compiled the function to a silent no-op.
 *
 * Author's notes on the original: "61:67 2075", "32:32 289" (presumably
 * measured miss counts for those matrix sizes).
 */
void transpose_general_block8(int M, int N, int A[N][M], int B[M][N])
{
    enum { BLOCK = 8 };
    int i, j, jj, ii;
    int arr[BLOCK];

    /* Row bands that contain a full BLOCK rows. */
    for (i = 0; i + BLOCK <= N; i += BLOCK) {
        /* Full BLOCK x BLOCK tiles inside the band. */
        for (j = 0; j + BLOCK <= M; j += BLOCK) {
            for (ii = 0; ii < BLOCK; ++ii) {
                for (jj = 0; jj < BLOCK; ++jj) {
                    arr[jj] = A[i + ii][jj + j];
                }
                for (jj = 0; jj < BLOCK; ++jj) {
                    B[jj + j][i + ii] = arr[jj];
                }
            }
        }
        /* Leftover columns of the band: one column strip at a time. */
        for (; j < M; ++j) {
            for (ii = 0; ii < BLOCK; ++ii) {
                arr[ii] = A[i + ii][j];
            }
            for (ii = 0; ii < BLOCK; ++ii) {
                B[j][i + ii] = arr[ii];
            }
        }
    }

    /* Leftover rows below the last full band. */
    for (; i < N; i++) {
        for (j = 0; j + BLOCK <= M; j += BLOCK) {
            for (jj = 0; jj < BLOCK; ++jj) {
                arr[jj] = A[i][jj + j];
            }
            for (jj = 0; jj < BLOCK; ++jj) {
                B[jj + j][i] = arr[jj];
            }
        }
        /* Bottom-right corner: element by element. */
        for (; j < M; ++j) {
            B[j][i] = A[i][j];
        }
    }
}

这次 lab 对 part B 的解答其实不够深入, 如果能更细致地统计 cache 的 miss 情况, 应该能得到更好的解答.

相关阅读:
基于BIM与点云数据的塔吊仿真系统记录
 ModuleNotFoundError: No module named 'imgaug'
ifc地形数据测试
 Command 'protoc' not found, but can be installed with
运行错误
 java遍历目录下的目录和文件
 给输出框编号
 pytorch yolov5两块gpu训练日志
 matlab atan2
执行的命令
原文地址：https://www.cnblogs.com/xxrlz/p/16096889.html