• ARM NEON 64bit 查找表替换


    实测效果不明显:如果表的长度不超过 64 个 uint8_t,应该可以提高查表速度;否则还是纯 C 实现来得快。

    #ifdef HAVE_NEON_AARCH64

    /**
     * Translate a byte buffer through a 256-entry lookup table:
     * output_ptr[i] = lookup_table[input_ptr[i]] for every i in [0, length).
     *
     * Full 64-byte blocks are processed with NEON TBL/TBX instructions; any
     * remaining bytes (length not a multiple of 64) use a scalar loop, so no
     * byte beyond `length` is read from input_ptr or written to output_ptr.
     *
     * @param lookup_table  Table of exactly 256 bytes (all 256 are read).
     * @param length        Number of bytes to translate.
     * @param input_ptr     Source bytes; each one is an index into the table.
     * @param output_ptr    Destination for the translated bytes. May be equal
     *                      to input_ptr for an in-place translation.
     */
    void table_lookup_AArch64_neon(uint8_t* lookup_table, uint32_t length, uint8_t* input_ptr, uint8_t* output_ptr)
    {
        /* Load the 256-byte table as four groups of 64 bytes each. */
        const uint8x16x4_t table0 = vld1q_u8_x4(lookup_table);
        const uint8x16x4_t table1 = vld1q_u8_x4(lookup_table + 64);
        const uint8x16x4_t table2 = vld1q_u8_x4(lookup_table + 128);
        const uint8x16x4_t table3 = vld1q_u8_x4(lookup_table + 192);

        /* Each table group covers 64 indices; subtracting 64 (with wrap-around)
         * rebases the indices onto the next group. */
        const uint8x16_t step = vdupq_n_u8(64);

        /* Largest multiple of 64 that fits in length. */
        const uint32_t simd_len = length & ~(uint32_t)63;
        uint32_t i = 0;

        for (; i < simd_len; i += 64) {
            uint8x16x4_t elements = vld1q_u8_x4(input_ptr + i);

            for (int j = 0; j < 4; j++) {
                uint8x16_t idx = elements.val[j];

                /* TBL yields 0 for out-of-range indices (>= 64), giving the
                 * accumulator a defined starting value. */
                uint8x16_t out = vqtbl4q_u8(table0, idx);

                /* TBX keeps the accumulator lane when the index is out of
                 * range, so each lane is filled by exactly one of the groups. */
                idx = vsubq_u8(idx, step);
                out = vqtbx4q_u8(out, table1, idx);

                idx = vsubq_u8(idx, step);
                out = vqtbx4q_u8(out, table2, idx);

                idx = vsubq_u8(idx, step);
                elements.val[j] = vqtbx4q_u8(out, table3, idx);
            }

            vst1q_u8_x4(output_ptr + i, elements);
        }

        /* Scalar tail for the last (length % 64) bytes. */
        for (; i < length; i++) {
            output_ptr[i] = lookup_table[input_ptr[i]];
        }
    }
  • 相关阅读:
    2-4 递增链表的插入 链表
    KMPnext数组自看
    Shortest Prefixes POJ
    Xor Sum HDU
    Immediate Decodability HDU
    Repository HDU
    "strcmp()" Anyone? UVA
    Remember the Word UVALive
    A Magic Lamp HDU
    Check Corners HDU
  • 原文地址:https://www.cnblogs.com/awiki/p/10950474.html
Copyright © 2020-2023  润新知