• 访存加速-Speed-up of Memory Access Intensive Program


    【参考网址】
    关于采用PLD指令进行内存预取减少内存操作指令的等待时间:
    https://www.jianshu.com/p/7b3bfc3aed12
    关于L1,L2,L3的缓存介绍:
    https://www.cnblogs.com/arnoldlu/p/7883663.html
    关于乱序处理和顺序处理的解释:(本质上不存在乱序,只是指令之间的依赖关系)
    https://www.sohu.com/a/127028459_505803
    实际测试中,PLD预取与NEON的v_load配合使用时似乎没有起到任何作用,反而有些变慢了。U_U

    代码如下:

    static inline void prefetch_range(uint8_t *addr, size_t len) { uint8_t *cp; uint8_t *end = addr + len; for (cp = addr; cp < end; cp += 1) __builtin_prefetch(cp, 0, 3); }

    static inline void prestore_range(uint8_t addr, size_t len){
    uint8_t
    cp;
    uint8_t *end = addr + len;
    for (cp = addr; cp < end; cp += 1) __builtin_prefetch(cp, 1, 3);
    }

    void GetPixelOrder_CPU(cv::Mat & order, const cv::Mat & grey){
    uint8_t data_order = order.data;
    uint8_t
    data_grey = grey.data;
    if(order.type() != CV_8UC1 ||
    grey.type() != CV_8UC1 ||
    order.size() != grey.size()){
    CAP_LOGE("[GetPixelOrder] invalid inputs");
    exit(-1);
    }
    int h = grey.size().height;
    int w = grey.size().width;
    int offset;
    int idx;
    uint8_t buf[8]; // window is a 4 x 2 ( width x height ) rectangle
    uint8_t center; // center is in fact the left corner pixel in window
    for(int i=0; i<h-1; ++i){
    offset = i * w;
    idx = 6;
    buf[0] = data_grey[offset];
    buf[1] = data_grey[offset + w];

        </span><span class="sc11">buf</span><span class="sc10">[</span><span class="sc4">2</span><span class="sc10">]</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">data_grey</span><span class="sc10">[</span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">1</span><span class="sc10">];</span><span class="sc0">
        </span><span class="sc11">buf</span><span class="sc10">[</span><span class="sc4">3</span><span class="sc10">]</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">data_grey</span><span class="sc10">[</span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">1</span><span class="sc10">];</span><span class="sc0">
    
        </span><span class="sc11">buf</span><span class="sc10">[</span><span class="sc4">4</span><span class="sc10">]</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">data_grey</span><span class="sc10">[</span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">2</span><span class="sc10">];</span><span class="sc0">
        </span><span class="sc11">buf</span><span class="sc10">[</span><span class="sc4">5</span><span class="sc10">]</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">data_grey</span><span class="sc10">[</span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">2</span><span class="sc10">];</span><span class="sc0">
        </span><span class="sc5">for</span><span class="sc10">(</span><span class="sc16">int</span><span class="sc0"> </span><span class="sc11">j</span><span class="sc10">=</span><span class="sc4">0</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc11">j</span><span class="sc10">&lt;</span><span class="sc11">w</span><span class="sc10">-</span><span class="sc4">3</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc10">++</span><span class="sc11">j</span><span class="sc10">){</span><span class="sc0">
            </span><span class="sc2">// update the outdated pixels with new ones coming in by the right side
    

    buf[idx] = data_grey[offset + 3];
    buf[idx + 1] = data_grey[offset + w + 3];
    // update window position
    ++offset;
    // update buffer state: position of outdating pixels
    idx = (idx + 2) % 8;
    // the center of window is always the up-left one
    center = buf[idx];
    // calculate the order for center point within current window
    uint8_t counter = 0;
    for(int k=1; k<8; ++k) counter += (buf[(idx + k)%8] > center);
    data_order[offset] = counter << 5;
    }
    }
    }

    void GetPixelOrder_NEON(cv::Mat & order, const cv::Mat & grey){
    uint8_t data_order = order.data;
    uint8_t
    data_grey = grey.data;
    if(order.type() != CV_8UC1 ||
    grey.type() != CV_8UC1 ||
    order.size() != grey.size()){
    CAP_LOGE("[GetPixelOrder] invalid inputs");
    exit(-1);
    }
    int h = grey.size().height;
    int w = grey.size().width;
    int offset;

    </span><span class="sc11">cv</span><span class="sc10">::</span><span class="sc11">v_uint8x16</span><span class="sc0"> </span><span class="sc11">v_b_0</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_1</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_2</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_3</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_4</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_5</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_6</span><span class="sc10">,</span><span class="sc0"> </span><span class="sc11">v_b_7</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc2">// 4(w) x 2(h) window
    

    cv::v_uint8x16 v_counter;
    cv::v_uint8x16 v_flag;

    </span><span class="sc16">int</span><span class="sc0"> </span><span class="sc11">y_end</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">h</span><span class="sc0"> </span><span class="sc10">-</span><span class="sc0"> </span><span class="sc4">1</span><span class="sc10">;</span><span class="sc0">
    </span><span class="sc16">int</span><span class="sc0"> </span><span class="sc11">x_end</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">-</span><span class="sc0"> </span><span class="sc4">15</span><span class="sc10">;</span><span class="sc0">
    
    </span><span class="sc5">for</span><span class="sc10">(</span><span class="sc16">int</span><span class="sc0"> </span><span class="sc11">i</span><span class="sc10">=</span><span class="sc4">0</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc11">i</span><span class="sc10">&lt;</span><span class="sc11">y_end</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc10">++</span><span class="sc11">i</span><span class="sc10">){</span><span class="sc0">
        </span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">i</span><span class="sc0"> </span><span class="sc10">*</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc10">;</span><span class="sc0">
        </span><span class="sc5">for</span><span class="sc10">(</span><span class="sc16">int</span><span class="sc0"> </span><span class="sc11">j</span><span class="sc10">=</span><span class="sc4">0</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc11">j</span><span class="sc10">&lt;</span><span class="sc11">x_end</span><span class="sc10">;</span><span class="sc0"> </span><span class="sc11">j</span><span class="sc10">+=</span><span class="sc4">16</span><span class="sc10">){</span><span class="sc0">
            </span><span class="sc2">// memory hint instructions: PLD Commands to enable data prefetching
    

    //prefetch_range(data_grey + offset, 24);
    //prefetch_range(data_grey + offset + w, 24);
    //prestore_range(data_order + offset, 16);

    // update window
    v_b_0 = cv::v_load(data_grey + offset);
    v_b_2 = cv::v_load(data_grey + offset + 1);
    v_b_4 = cv::v_load(data_grey + offset + 2);
    v_b_6 = cv::v_load(data_grey + offset + 3);

            </span><span class="sc11">v_b_1</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">cv</span><span class="sc10">::</span><span class="sc11">v_load</span><span class="sc10">(</span><span class="sc11">data_grey</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc10">);</span><span class="sc0">
            </span><span class="sc11">v_b_3</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">cv</span><span class="sc10">::</span><span class="sc11">v_load</span><span class="sc10">(</span><span class="sc11">data_grey</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">1</span><span class="sc10">);</span><span class="sc0">
            </span><span class="sc11">v_b_5</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">cv</span><span class="sc10">::</span><span class="sc11">v_load</span><span class="sc10">(</span><span class="sc11">data_grey</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">2</span><span class="sc10">);</span><span class="sc0">
            </span><span class="sc11">v_b_7</span><span class="sc0"> </span><span class="sc10">=</span><span class="sc0"> </span><span class="sc11">cv</span><span class="sc10">::</span><span class="sc11">v_load</span><span class="sc10">(</span><span class="sc11">data_grey</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">offset</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc11">w</span><span class="sc0"> </span><span class="sc10">+</span><span class="sc0"> </span><span class="sc4">3</span><span class="sc10">);</span><span class="sc0">
    
            </span><span class="sc2">// calculate the order for center point within current window
    

    v_counter = cv::v_setall_u8(0);
    v_flag = v_b_0 < v_b_1;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_2;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_3;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_4;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_5;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_6;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_flag = v_b_0 < v_b_7;
    v_flag = v_flag >> 7;
    v_counter = v_counter + v_flag;
    v_counter = v_counter << 5;
    cv::v_store(data_order + offset, v_counter);
    // update window position
    offset += 16;
    }
    }
    }

    CPU版本:17-26ms(手机运行不稳定)
    NEON版本:1-2ms(绝大部分时间是1ms)
    NEON+PLD/PST(预取/存):3-7ms(不稳定)
    将预取prefetch_range以及prestore_range加上后稍微稳定些了,但是速度变慢了。
    因此还需要认真思考预取的使用策略,使其生效。

  • 相关阅读:
    最小圆覆盖
    BZOJ3572 [Hnoi2014]世界树 【虚树 + 树形dp】
    一些组合数学
    BZOJ3611 [Heoi2014]大工程 【虚树】
    线段树合并
    BZOJ4446 [Scoi2015]小凸玩密室 【树形Dp】
    生成函数小记
    BZOJ2337 [HNOI2011]XOR和路径 【概率dp + 高斯消元】
    连续数字异或和
    POJ2976:Dropping tests——题解
  • 原文地址:https://www.cnblogs.com/thisisajoke/p/12017657.html
Copyright © 2020-2023  润新知