• CSAPP实验5: cachelab


    理论上还有第6个实验perflab....

    Part A

    之前寒假的时候beginend说过cachelab很难,但是感觉做下来还行?也可能是他把malloclab记错成了cachelab也说不定(

    不管了。Part A就是要按照书上cache memory的组织结构写一个简单的判断器,即给定若干读和写,来判断每次对内存的操作是/否命中缓存。事实上开一个三维数组就好了(如果不实现Blocks甚至只需要二维)

    不过这一次还是学到很多东西的,这里罗列一下

    1. getopt()函数,这个函数来自unistd.h或直接是getopt.h,两者选哪一个取决于C的标准。这个函数实现了从argc, argv中一个一个取出参数的功能,并且提供了[必选参数/可选参数/单独参数]三类参数的提取,很好用。如果有类似--debug这样的长参数,则要用getopt_long()之类的函数
    2. strtok()函数,这个最早见到是在PA lab里面。可以理解为split()函数
    3. sscanf()和atoi(),这两个可以把字符串转成数字。根据数字的进制选择用哪一个:atoi()只能处理十进制,十六进制等其他进制要用sscanf()。事实上还有类似的atof()、itoa()(非标准C函数)、sprintf()这样的函数。多看官方文档~
    4. calloc()类似于malloc(),区别在于calloc()会初始化分配的内存为0,常用于数组(回想一下数组的默认初始化)

    个人觉得库函数还是很好用的,至少比自己写要精炼得多了。看来还是要多研究研究别人造过的轮子啊

    难点大概就在于getopt()和如何优雅地取出对应的位,还有就是对M操作的处理。这些都不算太难,写就完了。注意LRU的策略指的是最后一次访问最早的先被删。

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include <getopt.h>
    #include <stdbool.h>
    
    #include "cachelab.h"
    
    /*
     * Option string handed to getopt() in main(): -s, -E, -b and -t each
     * require an argument (single ':'); -h and -v are declared with "::",
     * i.e. an optional argument.
     */
    #define CMD_ARGS "h::v::s:E:b:t:"
    /* Largest 32-bit signed value; update() uses it as the initial "oldest
     * timestamp seen so far" when scanning for the least recently used line. */
    #define INF 0x7FFFFFFF
    
    /* 64-bit integer used by process() to hold a parsed trace address. */
    typedef long long LL;
    
    /*
     * One cache line. Despite the type name, each object is a single line:
     * the cache is addressed as Line[set_no][i] with i < E, so a row of E
     * of these objects forms one set.
     */
    typedef struct {
    	int *block;	/* storage of B ints allocated by init(); not read by the code shown here */
    	int sign, last;	/* sign: tag bits (address >> (s + b)); last: cur_time of the most recent access */
    	bool used;	/* valid flag: true once update() has installed a tag in this line */
    } Set;
    
    /* The simulated cache, indexed as Line[set_no][line_no]; allocated by init(). */
    Set **Line;
    
    /* Cache geometry: s, E and b come from the -s/-E/-b options; main()
     * derives S = 1 << s and B = 1 << b from them. */
    int s, E, b, S, B;
    /* Running totals bumped by hit()/miss()/evict() and reported by process(). */
    int hit_cnt, miss_cnt, evict_cnt;
    
    /* NOTE(review): main() declares a local with the same name that shadows
     * this variable; nothing in the code shown here assigns to the global. */
    char *filename;
    
    /* Verbose flag, switched on by -v; output() prints only when it is true. */
    bool debug = false;
    
    /* Write `s` to stdout, but only while verbose mode (`debug`) is enabled. */
    void output(char *s) {
    	if (!debug) {
    		return;
    	}
    	printf("%s", s);
    }
    
    /* Count one cache hit and, in verbose mode, log it. */
    void hit() {
    	hit_cnt += 1;
    	output("hit ");
    }
    
    /* Count one cache miss and, in verbose mode, log it. */
    void miss() {
    	miss_cnt += 1;
    	output("miss ");
    }
    
    /* Count one eviction and, in verbose mode, log it. */
    void evict() {
    	evict_cnt += 1;
    	output("eviction ");
    }
    
    /*
     * Open `filename` for reading and return the stream.
     * If the file cannot be opened, print an error message to stdout and
     * terminate the program with exit status -1; the function never
     * returns NULL.
     */
    FILE *openFile(char *filename) {
    	FILE *stream = fopen(filename, "r");
    	if (stream != NULL) {
    		return stream;
    	}
    
    	puts("Error: file not found");
    	exit(-1);
    }
    
    void update(int cur_time, int set_no, int block_offset, int sign, int wsiz) {
    	int rec = -1, rec_last = INF;
    	for (int i = 0; i < E; ++ i) {
    		Set *tmp_line = &Line[set_no][i];
    		if (!tmp_line->used) {
    			rec = i;
    			break;
    		}
    		if (tmp_line->last < rec_last) {
    			rec_last = tmp_line->last;
    			rec = i;
    		}
    	}
    	Line[set_no][rec].last = cur_time;
    	Line[set_no][rec].sign = sign;
    
    	if (Line[set_no][rec].used) {
    		evict();
    	} else {
    		Line[set_no][rec].used = true;
    	}
    }
    
    bool load(int cur_time, int set_no, int block_offset, int sign, int wsiz) {
    	for (int i = 0; i < E; ++ i) {
    		Set *tmp_line = &Line[set_no][i];
    		if (tmp_line->used && tmp_line->sign == sign) {
    			tmp_line->last = cur_time;
    			hit();
    			return true;
    		}
    	}
    
    	miss();
    	update(cur_time, set_no, block_offset, sign, wsiz);
    	return false;
    }
    
    /*
     * process - replay a memory trace against the simulated cache and print
     * the final hit/miss/eviction totals with printSummary().
     *
     * Each trace line is expected to look like " L 7ff000398,8": an operation
     * letter, a hexadecimal address, a comma and a decimal size. Only 'L',
     * 'S' and 'M' operations are simulated; instruction fetches ('I') and
     * blank or malformed lines are skipped. An 'M' (modify) is a load
     * followed by a store, so it accesses the cache twice.
     *
     * fin  open trace stream; it is read to EOF but not closed here, so the
     *      caller keeps ownership of it.
     */
    void process(FILE *fin) {
    	char line[64];
    	int cur_time = 0;
    
    	/* Loop on fgets() itself: testing feof() first would drop a final
    	 * line that is not terminated by a newline. */
    	while (fgets(line, sizeof line, fin) != NULL) {
    		char type;
    		unsigned long long addr;
    		int size;
    
    		/* The leading blank in the format skips the space that precedes
    		 * data accesses in the trace. */
    		if (sscanf(line, " %c %llx,%d", &type, &addr, &size) != 3) {
    			continue;
    		}
    		if (type != 'L' && type != 'S' && type != 'M') {
    			continue;
    		}
    
    		++cur_time;
    		if (debug) printf("%c %llx,%d ", type, addr, size);
    
    		int block_offset = (int)(addr & (unsigned long long)(B - 1));
    		int set_no = (int)((addr >> b) & (unsigned long long)(S - 1));
    		/* NOTE(review): the tag is narrowed to int because Set.sign is an
    		 * int, so tags that differ only above bit 31 compare equal. */
    		int sign = (int)(addr >> (s + b));
    
    		load(cur_time, set_no, block_offset, sign, size);
    		if (type == 'M') {
    			load(cur_time, set_no, block_offset, sign, size);
    		}
    		output("\n");
    	}
    	printSummary(hit_cnt, miss_cnt, evict_cnt);
    }
    
    /*
     * Allocate `count` zero-filled objects of `size` bytes each.
     * Prints an error and exits with status -1 if the allocation fails; a
     * NULL result for a zero-sized request is passed through unchanged.
     */
    static void *alloc_zeroed(size_t count, size_t size) {
    	void *mem = calloc(count, size);
    	if (mem == NULL && count != 0 && size != 0) {
    		fputs("Error: out of memory\n", stderr);
    		exit(-1);
    	}
    	return mem;
    }
    
    /*
     * init - build the cache described by the globals S (sets), E (lines per
     * set) and B (ints per block) and store it in Line.
     *
     * Every line starts out zeroed: unused, with tag and timestamp 0.
     * Allocation failures terminate the program instead of leaving NULL
     * pointers behind for later dereference.
     */
    void init() {
    	Line = alloc_zeroed((size_t)S, sizeof *Line);
    	for (int i = 0; i < S; ++ i) {
    		Line[i] = alloc_zeroed((size_t)E, sizeof *Line[i]);
    		for (int j = 0; j < E; ++ j) {
    			Line[i][j].block = alloc_zeroed((size_t)B, sizeof *Line[i][j].block);
    			Line[i][j].used = false;
    		}
    	}
    }
    
    /*
     * main - parse the command line, build the cache and replay the trace.
     *
     * Options (see CMD_ARGS):
     *   -s <s>     number of set index bits (S = 2^s sets)
     *   -E <E>     lines per set, must be at least 1
     *   -b <b>     number of block offset bits (B = 2^b)
     *   -t <file>  trace file to replay (required)
     *   -v         verbose: print the outcome of every access
     *   -h         print usage and exit
     *
     * Returns 0 on success and 1 when the arguments are missing or invalid.
     */
    int main(int argc, char *const *argv) {
    	static const char usage[] =
    		"Usage: %s [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n";
    	char *trace_path = NULL;
    
    	for (int opt; (opt = getopt(argc, argv, CMD_ARGS)) != -1; ) {
    		switch (opt) {
    			case 's': {
    				s = atoi(optarg);
    				break;
    			}
    			case 'E': {
    				E = atoi(optarg);
    				break;
    			}
    			case 'b': {
    				b = atoi(optarg);
    				break;
    			}
    			case 't': {
    				trace_path = optarg;
    				break;
    			}
    			case 'v': {
    				debug = true;
    				break;
    			}
    			case 'h': {
    				printf(usage, argv[0]);
    				return 0;
    			}
    			default: {
    				fprintf(stderr, usage, argv[0]);
    				return 1;
    			}
    		}
    	}
    
    	/* Reject values that would make 1 << s or 1 << b overflow an int, an
    	 * empty set (E < 1), or a missing trace file, before they are used. */
    	if (s < 0 || s > 30 || b < 0 || b > 30 || E < 1 || trace_path == NULL) {
    		fprintf(stderr, usage, argv[0]);
    		return 1;
    	}
    	S = 1 << s;
    	B = 1 << b;
    
    	init();
    
    	FILE *fin = openFile(trace_path);
    	process(fin);
    	fclose(fin);
    	return 0;
    }
    

    Part B

    写死我了...

    一个最naive的优化就是视频中说的blocking,通过恰当分块就可以实现高效利用cache
    对于32x32的问题,答案很简单就是分成8x8的块,61x67的也类似,难的在于64x64

    难点在于:一次访存会加载连续8个元素,而访问行数大于4时就会出现thrashing,因此用8x8的block会thrashing,用4x4的block则利用不充分
    最后是看了别人的解析才会做的,具体可以看这篇https://www.cnblogs.com/liqiuhao/p/8026100.html
    大概意思就是把8x8分成四个4x4,每次用4x8的方式移动,这样是坠吼的

    /* 
     * transpose_submit - This is the solution transpose function that you
     *     will be graded on for Part B of the assignment. Do not change
     *     the description string "Transpose submission", as the driver
     *     searches for that string to identify the transpose function to
     *     be graded. 
     *
     *     Writes the transpose of the N x M matrix A into the M x N matrix B.
     *     Three strategies are used, chosen by the dimensions:
     *       - M == 64 (N a multiple of 8): 8x8 tiles handled as four 4x4
     *         quadrants, moved four rows at a time; the top-right quadrant
     *         of the B tile is used as scratch space in the first pass.
     *       - M == 32 (N a multiple of 8): plain 8x8 tiles, one row of A
     *         read into locals before the column of B is written.
     *       - anything else (including 61 x 67): 13-row by 8-column tiles
     *         with bounds checks, so every size is transposed correctly.
     *
     *     Only 12 int locals are used (t0..t7 plus four loop indices).
     */
    char transpose_submit_desc[] = "Transpose submission";
    void transpose_submit(int M, int N, int A[N][M], int B[M][N])
    {
        int t0, t1, t2, t3, t4, t5, t6, t7;
        int si, sj, i, j;
    
        if (M == 64 && N % 8 == 0) {
            for (si = 0; si < N; si += 8) {
                for (sj = 0; sj < M; sj += 8) {
                    /* Pass 1: top four rows of the A tile. The left half goes
                     * to its final place; the right half is parked, column
                     * order reversed, in the top-right quadrant of the B tile. */
                    for (i = si; i < si + 4; ++ i) {
                        t0 = A[i][sj];
                        t1 = A[i][sj + 1];
                        t2 = A[i][sj + 2];
                        t3 = A[i][sj + 3];
                        t4 = A[i][sj + 4];
                        t5 = A[i][sj + 5];
                        t6 = A[i][sj + 6];
                        t7 = A[i][sj + 7];
    
                        B[sj][i] = t0;
                        B[sj + 1][i] = t1;
                        B[sj + 2][i] = t2;
                        B[sj + 3][i] = t3;
                        B[sj][i + 4] = t7;
                        B[sj + 1][i + 4] = t6;
                        B[sj + 2][i + 4] = t5;
                        B[sj + 3][i + 4] = t4;
                    }
                    /* Pass 2: bottom four rows of the A tile. Each step moves
                     * one parked row down to the bottom-left quadrant, then
                     * fills the top-right and bottom-right rows it touched. */
                    for (j = 0; j < 4; ++ j) {
                        t0 = A[si + 4][sj + j + 4];
                        t1 = A[si + 5][sj + j + 4];
                        t2 = A[si + 6][sj + j + 4];
                        t3 = A[si + 7][sj + j + 4];
    
                        t4 = A[si + 4][sj + 3 - j];
                        t5 = A[si + 5][sj + 3 - j];
                        t6 = A[si + 6][sj + 3 - j];
                        t7 = A[si + 7][sj + 3 - j];
    
                        /* Must happen before the parked row is overwritten. */
                        B[sj + j + 4][si] = B[sj + 3 - j][si + 4];
                        B[sj + j + 4][si + 1] = B[sj + 3 - j][si + 5];
                        B[sj + j + 4][si + 2] = B[sj + 3 - j][si + 6];
                        B[sj + j + 4][si + 3] = B[sj + 3 - j][si + 7];
    
                        B[sj + 3 - j][si + 4] = t4;
                        B[sj + 3 - j][si + 5] = t5;
                        B[sj + 3 - j][si + 6] = t6;
                        B[sj + 3 - j][si + 7] = t7;
                        B[sj + j + 4][si + 4] = t0;
                        B[sj + j + 4][si + 5] = t1;
                        B[sj + j + 4][si + 6] = t2;
                        B[sj + j + 4][si + 7] = t3;
                    }
                }
            }
        } else if (M == 32 && N % 8 == 0) {
            for (si = 0; si < N; si += 8) {
                for (sj = 0; sj < M; sj += 8) {
                    for (i = si; i < si + 8; ++ i) {
                        /* Read the whole row segment before writing to B. */
                        t0 = A[i][sj];
                        t1 = A[i][sj + 1];
                        t2 = A[i][sj + 2];
                        t3 = A[i][sj + 3];
                        t4 = A[i][sj + 4];
                        t5 = A[i][sj + 5];
                        t6 = A[i][sj + 6];
                        t7 = A[i][sj + 7];
    
                        B[sj][i] = t0;
                        B[sj + 1][i] = t1;
                        B[sj + 2][i] = t2;
                        B[sj + 3][i] = t3;
                        B[sj + 4][i] = t4;
                        B[sj + 5][i] = t5;
                        B[sj + 6][i] = t6;
                        B[sj + 7][i] = t7;
                    }
                }
            }
        } else {
            /* General case: bounds-checked tiles work for any M and N. */
            for (si = 0; si < N; si += 13) {
                for (sj = 0; sj < M; sj += 8) {
                    for (i = si; i < si + 13 && i < N; ++ i) {
                        for (j = sj; j < sj + 8 && j < M; ++ j) {
                            t0 = A[i][j];
                            B[j][i] = t0;
                        }
                    }
                }
            }
        }
    }
    

    本文来自博客园,作者:jjppp。本博客所有文章除特别声明外,均采用CC BY-SA 4.0 协议

  • 相关阅读:
    Vscode开发工具中的Simple React Snippets插件,对React开发有哪些便捷
    函数防抖和节流
    4.怎么样用CSS实现一个loading效果
    3.常见清除浮动的
    2.css处理各种溢出
    1. css画三角形
    2.javascript中call()和apply()区别
    1.js的继承的实现方法
    css、js小技巧
    JS函数声明和函数表达式的关系
  • 原文地址:https://www.cnblogs.com/jjppp/p/14493899.html
Copyright © 2020-2023  润新知