• libyuv 代码结构分析,借用其NEON/ARM64优化代码


    I 入口

    格式转换的入口函数都在 convert_xx 之类的文件中。我的 Android 程序主要用到的是把 xx 格式(例如 I420)转成 NV12,其入口在 convert_from.cc 中。

    函数为:

    // Declaration of the I420ToNV12 conversion entry point.
    // Inputs: three separate source planes (src_y, src_u, src_v), each with its
    // own stride. Outputs: a Y plane (dst_y) and a single UV plane (dst_uv),
    // each with its own stride.
    // width/height: image dimensions -- presumably those of the Y plane; confirm
    // against the definition, which is not shown here.
    // Returns an int; the meaning of the value is not visible in this excerpt.
    int I420ToNV12(const uint8* src_y,
                   int src_stride_y,
                   const uint8* src_u,
                   int src_stride_u,
                   const uint8* src_v,
                   int src_stride_v,
                   uint8* dst_y,
                   int dst_stride_y,
                   uint8* dst_uv,
                   int dst_stride_uv,
                   int width,
                   int height);

    I420ToNV12 的函数体内部主要调用了下面两个函数:

    // Copy the Y plane from the source to the destination, row by row.
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
    // Merge the separate U and V planes into the single dst_uv plane.
    // halfwidth/halfheight are computed elsewhere in I420ToNV12 (not shown);
    // presumably the chroma plane dimensions -- confirm against the full source.
    MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
                   halfwidth, halfheight);

    其入口都在planar_functions.cc中:

    // Copies a plane of bytes from src_y to dst_y one row at a time, using the
    // most specialised CopyRow variant that is compiled in and supported by the
    // CPU (falling back to CopyRow_C).
    //
    //   src_y, src_stride_y  source plane and the step between its rows
    //   dst_y, dst_stride_y  destination plane and the step between its rows
    //   width                number of bytes copied per row
    //   height               number of rows; a negative value makes the
    //                        destination be written bottom-up (inverted image)
    //
    // A non-positive width or a zero height is a no-op.
    void CopyPlane(const uint8* src_y,
                   int src_stride_y,
                   uint8* dst_y,
                   int dst_stride_y,
                   int width,
                   int height) {
      int y;
      void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
      // Nothing to copy. This guard is required for safety, not just speed:
      // a width of 0 satisfies IS_ALIGNED(), which selects the aligned SIMD
      // kernel, and a kernel such as CopyRow_NEON moves a full 32-byte vector
      // before it tests the remaining count.
      if (width <= 0 || height == 0) {
        return;
      }
      // Negative height means invert the image.
      if (height < 0) {
        height = -height;
        // Start at the last destination row and walk backwards.
        dst_y = dst_y + (height - 1) * dst_stride_y;
        dst_stride_y = -dst_stride_y;
      }
      // Coalesce rows: when both planes are tightly packed the whole plane can
      // be copied as one long row.
      if (src_stride_y == width && dst_stride_y == width) {
        width *= height;
        height = 1;
        src_stride_y = dst_stride_y = 0;
      }
      // Nothing to do: source and destination are the same rows.
      if (src_y == dst_y && src_stride_y == dst_stride_y) {
        return;
      }
      // Each enabled section below may override the choice made by an earlier
      // one, so the last matching section wins.
    #if defined(HAS_COPYROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2)) {
        CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
      }
    #endif
    #if defined(HAS_COPYROW_AVX)
      if (TestCpuFlag(kCpuHasAVX)) {
        CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
      }
    #endif
    #if defined(HAS_COPYROW_ERMS)
      if (TestCpuFlag(kCpuHasERMS)) {
        CopyRow = CopyRow_ERMS;
      }
    #endif
    #if defined(HAS_COPYROW_NEON)
      if (TestCpuFlag(kCpuHasNEON)) {
        CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
      }
    #endif
    #if defined(HAS_COPYROW_MIPS)
      if (TestCpuFlag(kCpuHasMIPS)) {
        CopyRow = CopyRow_MIPS;
      }
    #endif

      // Copy plane
      for (y = 0; y < height; ++y) {
        CopyRow(src_y, dst_y, width);
        src_y += src_stride_y;
        dst_y += dst_stride_y;
      }
    }
    
    // Merges a U plane and a V plane into one UV plane, one row at a time,
    // using the most specialised MergeUVRow variant that is compiled in and
    // supported by the CPU (falling back to MergeUVRow_C).
    //
    //   src_u, src_stride_u    source U plane and the step between its rows
    //   src_v, src_stride_v    source V plane and the step between its rows
    //   dst_uv, dst_stride_uv  destination UV plane and the step between its rows
    //   width                  number of U (and V) samples per row; the
    //                          coalescing test below treats a packed
    //                          destination row as width * 2 bytes
    //   height                 number of rows; a negative value makes the
    //                          destination be written bottom-up (inverted image)
    //
    // A non-positive width or a zero height is a no-op.
    void MergeUVPlane(const uint8* src_u,
                      int src_stride_u,
                      const uint8* src_v,
                      int src_stride_v,
                      uint8* dst_uv,
                      int dst_stride_uv,
                      int width,
                      int height) {
      int y;
      void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width) = MergeUVRow_C;
      // Nothing to merge. A width of 0 satisfies IS_ALIGNED(), so without this
      // guard an empty row would be handed to an aligned SIMD kernel; a kernel
      // that processes a full vector before testing the remaining count (as
      // CopyRow_NEON does) would then touch memory out of bounds.
      if (width <= 0 || height == 0) {
        return;
      }
      // Negative height means invert the image.
      if (height < 0) {
        height = -height;
        // Start at the last destination row and walk backwards.
        dst_uv = dst_uv + (height - 1) * dst_stride_uv;
        dst_stride_uv = -dst_stride_uv;
      }
      // Coalesce rows: when all three planes are tightly packed the whole plane
      // can be merged as one long row.
      if (src_stride_u == width && src_stride_v == width &&
          dst_stride_uv == width * 2) {
        width *= height;
        height = 1;
        src_stride_u = src_stride_v = dst_stride_uv = 0;
      }
      // Each enabled section below may override the choice made by an earlier
      // one, so the last matching section wins.
    #if defined(HAS_MERGEUVROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2)) {
        MergeUVRow = MergeUVRow_Any_SSE2;
        if (IS_ALIGNED(width, 16)) {
          MergeUVRow = MergeUVRow_SSE2;
        }
      }
    #endif
    #if defined(HAS_MERGEUVROW_AVX2)
      if (TestCpuFlag(kCpuHasAVX2)) {
        MergeUVRow = MergeUVRow_Any_AVX2;
        if (IS_ALIGNED(width, 32)) {
          MergeUVRow = MergeUVRow_AVX2;
        }
      }
    #endif
    #if defined(HAS_MERGEUVROW_NEON)
      if (TestCpuFlag(kCpuHasNEON)) {
        MergeUVRow = MergeUVRow_Any_NEON;
        if (IS_ALIGNED(width, 16)) {
          MergeUVRow = MergeUVRow_NEON;
        }
      }
    #endif
    #if defined(HAS_MERGEUVROW_MSA)
      if (TestCpuFlag(kCpuHasMSA)) {
        MergeUVRow = MergeUVRow_Any_MSA;
        if (IS_ALIGNED(width, 16)) {
          MergeUVRow = MergeUVRow_MSA;
        }
      }
    #endif

      for (y = 0; y < height; ++y) {
        // Merge a row of U and V into a row of UV.
        MergeUVRow(src_u, src_v, dst_uv, width);
        src_u += src_stride_u;
        src_v += src_stride_v;
        dst_uv += dst_stride_uv;
      }
    }
    View Code

    到目前为止的代码都是平台无关的(只是根据编译宏和CPU特性选择具体的行处理函数),也很容易看懂。

     

    II 平台相关代码

    平台相关代码都在 xx_neon.cc 和 xx_neon64.cc 中。具体到 CopyRow_NEON 和 MergeUVRow_NEON,相关代码在 row_neon64.cc/row_neon.cc 中:前者是 arm64 的代码,后者是 armeabi-v7a 的 NEON 代码。下面以 row_neon.cc(32 位 NEON 语法)中的 CopyRow_NEON 为例:

    // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
    void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
      asm volatile (
      "1:                                          
    "
        MEMACCESS(0)
        "vld1.8     {d0, d1, d2, d3}, [%0]!        
    "  // load 32
        "subs       %2, %2, #32                    
    "  // 32 processed per loop
        MEMACCESS(1)
        "vst1.8     {d0, d1, d2, d3}, [%1]!        
    "  // store 32
        "bgt        1b                             
    "
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(count)  // %2  // Output registers
      :                     // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
    }

    CopyRow_Any_NEON 和 MergeUVRow_Any_NEON 的代码都在 row_any.cc 中,any 的代码不区分 64 位和 32 位:

    // Any 1 to 1.
    // Generates a wrapper NAMEANY(src_ptr, dst_ptr, width) that accepts any
    // width on top of a SIMD row function ANY_SIMD that handles whole groups.
    //
    //   NAMEANY   name of the generated function
    //   ANY_SIMD  row function called as ANY_SIMD(src, dst, width)
    //   UVSHIFT   right shift applied to n when computing the source offset
    //   SBPP      multiplier for the source offset and remainder size
    //   BPP       multiplier for the destination offset and remainder size
    //   MASK      group size minus one; width & MASK is the remainder
    //
    // The largest part of the row that is a multiple of MASK + 1 is passed
    // straight to ANY_SIMD. The remainder is copied into the first half of a
    // zeroed scratch buffer, processed as one full group of MASK + 1 into the
    // second half (offset 128), and only the r * BPP valid bytes are copied out.
    // SS() is defined elsewhere (not shown here).
    #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
      void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {         \
        SIMD_ALIGNED(uint8 temp[128 * 2]);                                    \
        memset(temp, 0, 128); /* for YUY2 and msan */                         \
        int r = width & MASK;                                                 \
        int n = width & ~MASK;                                                \
        if (n > 0) {                                                          \
          ANY_SIMD(src_ptr, dst_ptr, n);                                      \
        }                                                                     \
        memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
        ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
        memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
      }
    
    // Generate CopyRow_Any_NEON on top of CopyRow_NEON, only when the NEON copy
    // row is available. Arguments: UVSHIFT = 0, SBPP = 1, BPP = 1, MASK = 31,
    // i.e. groups of 32 are passed to CopyRow_NEON directly and the remaining
    // width & 31 goes through the scratch buffer.
    #ifdef HAS_COPYROW_NEON
    ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
    #endif

    代码中用到的声明都在 libyuv/row.h 中。

    // Evaluates to non-zero when p is a multiple of a. The (a)-1 mask only
    // gives that result when a is a power of two. p is converted to uintptr_t,
    // so the macro accepts both pointers and integer values such as a width.
    // Note that 0 is reported as aligned.
    #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
    
    // Per-compiler definition of SIMD_ALIGNED(var) and of the fixed-size vector
    // types: vec*/uvec* are 16 bytes wide, lvec*/ulvec* are 32 bytes wide.
    // Branch 1: Visual C (not CLR, not clang) -- uses __declspec(align(N)).
    #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
    // SIMD_ALIGNED is 32-byte when VISUALC_HAS_AVX2 is defined, else 16-byte.
    #if defined(VISUALC_HAS_AVX2)
    #define SIMD_ALIGNED(var) __declspec(align(32)) var
    #else
    #define SIMD_ALIGNED(var) __declspec(align(16)) var
    #endif
    // Aligned plain arrays stand in for vector types on this compiler.
    typedef __declspec(align(16)) int16 vec16[8];
    typedef __declspec(align(16)) int32 vec32[4];
    typedef __declspec(align(16)) int8 vec8[16];
    typedef __declspec(align(16)) uint16 uvec16[8];
    typedef __declspec(align(16)) uint32 uvec32[4];
    typedef __declspec(align(16)) uint8 uvec8[16];
    typedef __declspec(align(32)) int16 lvec16[16];
    typedef __declspec(align(32)) int32 lvec32[8];
    typedef __declspec(align(32)) int8 lvec8[32];
    typedef __declspec(align(32)) uint16 ulvec16[16];
    typedef __declspec(align(32)) uint32 ulvec32[8];
    typedef __declspec(align(32)) uint8 ulvec8[32];
    // Branch 2: GCC or clang (excluding PNaCl) -- uses __attribute__.
    #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
    // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
    // SIMD_ALIGNED is 32-byte when an AVX2-capable compiler is flagged, else
    // 16-byte.
    #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
    #define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
    #else
    #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
    #endif
    // Compiler vector extension types; vector_size is given in bytes.
    typedef int16 __attribute__((vector_size(16))) vec16;
    typedef int32 __attribute__((vector_size(16))) vec32;
    typedef int8 __attribute__((vector_size(16))) vec8;
    typedef uint16 __attribute__((vector_size(16))) uvec16;
    typedef uint32 __attribute__((vector_size(16))) uvec32;
    typedef uint8 __attribute__((vector_size(16))) uvec8;
    typedef int16 __attribute__((vector_size(32))) lvec16;
    typedef int32 __attribute__((vector_size(32))) lvec32;
    typedef int8 __attribute__((vector_size(32))) lvec8;
    typedef uint16 __attribute__((vector_size(32))) ulvec16;
    typedef uint32 __attribute__((vector_size(32))) ulvec32;
    typedef uint8 __attribute__((vector_size(32))) ulvec8;
    // Branch 3: any other compiler -- no alignment request, plain arrays.
    #else
    #define SIMD_ALIGNED(var) var
    typedef int16 vec16[8];
    typedef int32 vec32[4];
    typedef int8 vec8[16];
    typedef uint16 uvec16[8];
    typedef uint32 uvec32[4];
    typedef uint8 uvec8[16];
    typedef int16 lvec16[16];
    typedef int32 lvec32[8];
    typedef int8 lvec8[32];
    typedef uint16 ulvec16[16];
    typedef uint32 ulvec32[8];
    typedef uint8 ulvec8[32];
    #endif
    
    ....

    之后只要把这些代码抠出来,便可以直接借用了。比起分别交叉编译两个库来说,方便许多。

  • 相关阅读:
    quartz CronExpression表达式
    nginx配置文件说明
    mysql的日期输出格式列出来
    linux配置定时备份mysql数据库
    Nginx 配置
    查看mysql 服务有哪些ip地址在连接。
    对HelloWorld进行探究
    SpringBoot热部署与启动速度优化
    SpringBoot 快速入门
    初始SpringBoot
  • 原文地址:https://www.cnblogs.com/blowing-in-the-wind/p/6496519.html
Copyright © 2020-2023  润新知