• SSE2 Intrinsics各函数介绍[转]


    SIMD相关头文件包括:

    //#include <ivec.h>//MMX
    //#include <fvec.h>//SSE(also include ivec.h)
    //#include <dvec.h>//SSE2(also include fvec.h)
    
    
    #include <mmintrin.h> //MMX
    #include <xmmintrin.h> //SSE(include mmintrin.h)
    #include <emmintrin.h> //SSE2(include xmmintrin.h)
    #include <pmmintrin.h> //SSE3(include emmintrin.h)
    #include <tmmintrin.h>//SSSE3(include pmmintrin.h)
    #include <smmintrin.h>//SSE4.1(include tmmintrin.h)
    #include <nmmintrin.h>//SSE4.2(include smmintrin.h)
    #include <wmmintrin.h>//AES(include nmmintrin.h)
    #include <immintrin.h>//AVX(include wmmintrin.h)
    #include <intrin.h>//(include immintrin.h)

     mmintrin.h为MMX 头文件,其中__m64的定义为:

    /* 64-bit MMX vector type as quoted from mmintrin.h (MSVC-specific syntax).
     * It is a union, so every member below is a different typed view of the
     * same 8 bytes; _CRT_ALIGN(8) presumably requests 8-byte alignment. */
    typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
    {
        unsigned __int64    m64_u64;       /* whole value as one unsigned 64-bit integer */
        float               m64_f32[2];    /* two single-precision floats */
        __int8              m64_i8[8];     /* eight signed 8-bit lanes */
        __int16             m64_i16[4];    /* four signed 16-bit lanes */
        __int32             m64_i32[2];    /* two signed 32-bit lanes */
        __int64             m64_i64;       /* whole value as one signed 64-bit integer */
        unsigned __int8     m64_u8[8];     /* eight unsigned 8-bit lanes */
        unsigned __int16    m64_u16[4];    /* four unsigned 16-bit lanes */
        unsigned __int32    m64_u32[2];    /* two unsigned 32-bit lanes */
    } __m64;

    xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

    /* 128-bit SSE vector type as quoted from xmmintrin.h (MSVC-specific syntax).
     * A union: each member is a different typed view of the same 16 bytes;
     * _CRT_ALIGN(16) presumably requests 16-byte alignment. */
    typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
         float               m128_f32[4];   /* four single-precision floats (first member) */
         unsigned __int64    m128_u64[2];   /* two unsigned 64-bit lanes */
         __int8              m128_i8[16];   /* sixteen signed 8-bit lanes */
         __int16             m128_i16[8];   /* eight signed 16-bit lanes */
         __int32             m128_i32[4];   /* four signed 32-bit lanes */
         __int64             m128_i64[2];   /* two signed 64-bit lanes */
         unsigned __int8     m128_u8[16];   /* sixteen unsigned 8-bit lanes */
         unsigned __int16    m128_u16[8];   /* eight unsigned 16-bit lanes */
         unsigned __int32    m128_u32[4];   /* four unsigned 32-bit lanes */
     } __m128;

    emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:

    /* 128-bit SSE2 integer vector type as quoted from emmintrin.h (MSVC-specific
     * syntax). A union: each member is a different integer-lane view of the same
     * 16 bytes; _CRT_ALIGN(16) presumably requests 16-byte alignment. */
    typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
        __int8              m128i_i8[16];   /* sixteen signed 8-bit lanes */
        __int16             m128i_i16[8];   /* eight signed 16-bit lanes */
        __int32             m128i_i32[4];   /* four signed 32-bit lanes */
        __int64             m128i_i64[2];   /* two signed 64-bit lanes */
        unsigned __int8     m128i_u8[16];   /* sixteen unsigned 8-bit lanes */
        unsigned __int16    m128i_u16[8];   /* eight unsigned 16-bit lanes */
        unsigned __int32    m128i_u32[4];   /* four unsigned 32-bit lanes */
        unsigned __int64    m128i_u64[2];   /* two unsigned 64-bit lanes */
    } __m128i;
    
    /* 128-bit SSE2 double-precision vector type as quoted from emmintrin.h
     * (MSVC-specific syntax). Unlike the other vector types shown here it is a
     * struct with a single member; _CRT_ALIGN(16) presumably requests 16-byte
     * alignment. */
    typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
        double              m128d_f64[2];   /* two double-precision lanes */
    } __m128d;

    emmintrin.h文件中各函数的介绍:

      1     /*----Floating-Point Intrinsics Using Streaming SIMD Extension 2 Instructions----*/
      2     //Arithmetic Operations(Floating Point):add、sub、mul、div、sqrt、min、max
      3     //返回一个__m128d的寄存器,r0=_A0+_B0, r1=_A1
      4     extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
      5     //返回一个__m128d的寄存器,r0=_A0+_B0, r1=_A1+_B1
      6     extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
      7     //返回一个__m128d的寄存器,r0=_A0-_B0, r1=_A1
      8     extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
      9     //返回一个__m128d的寄存器,r0=_A0-_B0, r1=_A1-_B1
     10     extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
     11     //返回一个__m128d的寄存器,r0=_A0*_B0, r1=_A1
     12     extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
     13     //返回一个__m128d的寄存器,r0=_A0*_B0, r1=_A1*_B1
     14     extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
     15     //返回一个__m128d的寄存器,r0=sqrt(_B0), r1=_A1
     16     extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
     17     //返回一个__m128d的寄存器,r0=sqrt(_A0), r1=sqrt(_A1)
     18     extern __m128d _mm_sqrt_pd(__m128d _A);
     19     //返回一个__m128d的寄存器,r0=_A0/_B0, r1=_A1
     20     extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
     21     //返回一个__m128d的寄存器,r0=_A0/_B0, r1=_A1/_B1
     22     extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
     23     //返回一个__m128d的寄存器,r0=min(_A0,_B0), r1=_A1
     24     extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
     25     //返回一个__m128d的寄存器,r0=min(_A0,_B0), r1=min(_A1,_B1)
     26     extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
     27     //返回一个__m128d的寄存器,r0=max(_A0,_B0), r1=_A1
     28     extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
     29     //返回一个__m128d的寄存器,r0=max(_A0,_B0), r1=max(_A1,_B1)
     30     extern __m128d _mm_max_pd(__m128d _A, __m128d _B);
     31 
     32     //Logical Operations(Floating Point SSE2 Intrinsics):and、or、xor、 andnot
     33     //返回一个__m128d的寄存器,r0=_A0 & _B0, r1=_A1 & _B1
     34     extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
     35     //返回一个__m128d的寄存器,r0=(~_A0) & _B0, r1=(~_A1) & _B1
     36     extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
     37     //返回一个__m128d的寄存器,r0=_A0 | _B0, r1=_A1 | _B1
     38     extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
     39     //返回一个__m128d的寄存器,r0=_A0 ^ _B0, r1=_A1 ^ _B1
     40     extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);
     41 
     42     //Comparisons:==、<、<=、>、>=、!=
     43     //返回一个__m128d的寄存器,r0=(_A0 == _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     44     extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
     45     //返回一个__m128d的寄存器,r0=(_A0 == _B0) ? 0xffffffffffffffff : 0x0, 
     46     //r1=(_A1 == _B1) ? 0xffffffffffffffff : 0x0
     47     extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
     48     //返回一个__m128d的寄存器,r0=(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     49     extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
     50     //返回一个__m128d的寄存器,r0=(_A0 < _B0) ? 0xffffffffffffffff : 0x0, 
     51     //r1=(_A1 < _B1) ? 0xffffffffffffffff : 0x0
     52     extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
     53     //返回一个__m128d的寄存器,r0=(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     54     extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
     55     //返回一个__m128d的寄存器,r0=(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, 
     56     //r1=(_A1 <= _B1) ? 0xffffffffffffffff : 0x0
     57     extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
     58     //返回一个__m128d的寄存器,r0=(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     59     extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
     60     //返回一个__m128d的寄存器,r0=(_A0 > _B0) ? 0xffffffffffffffff : 0x0, 
     61     //r1=(_A1 > _B1) ? 0xffffffffffffffff : 0x0
     62     extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
     63     //返回一个__m128d的寄存器,r0=(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     64     extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
     65     //返回一个__m128d的寄存器,r0=(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, 
     66     //r1=(_A1 >= _B1) ? 0xffffffffffffffff : 0x0
     67     extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
     68     //返回一个__m128d的寄存器,r0=(_A0 != _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     69     extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
     70     //返回一个__m128d的寄存器,r0=(_A0 != _B0) ? 0xffffffffffffffff : 0x0, 
     71     //r1=(_A1 != _B1) ? 0xffffffffffffffff : 0x0
     72     extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
     73     //返回一个__m128d的寄存器,r0=!(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     74     extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
     75     //返回一个__m128d的寄存器,r0=!(_A0 < _B0) ? 0xffffffffffffffff : 0x0, 
     76     //r1=!(_A1 < _B1) ? 0xffffffffffffffff : 0x0
     77     extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
     78     //返回一个__m128d的寄存器,r0=!(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     79     extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
     80     //返回一个__m128d的寄存器,r0=!(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, 
     81     //r1=!(_A1 <= _B1) ? 0xffffffffffffffff : 0x0
     82     extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
     83     //返回一个__m128d的寄存器,r0=!(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     84     extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
     85     //返回一个__m128d的寄存器,r0=!(_A0 > _B0) ? 0xffffffffffffffff : 0x0, 
     86     //r1=!(_A1 > _B1) ? 0xffffffffffffffff : 0x0
     87     extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
     88     //返回一个__m128d的寄存器,r0=!(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     89     extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
     90     //返回一个__m128d的寄存器,r0=!(_A0 >= _B0) ? 0xffffffffffffffff : 0x0,
     91     //r1=!(_A1 >= _B1) ? 0xffffffffffffffff : 0x0
     92     extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
     93     //返回一个__m128d的寄存器,r0=(_A0 ord _B0) ? 0xffffffffffffffff : 0x0, 
     94     //r1=(_A1 ord _B1) ? 0xffffffffffffffff : 0x0
     95     extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
     96     //返回一个__m128d的寄存器,r0=(_A0 ord _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
     97     extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
     98     //返回一个__m128d的寄存器,r0=(_A0 unord _B0) ? 0xffffffffffffffff : 0x0, 
     99     //r1=(_A1 unord _B1) ? 0xffffffffffffffff : 0x0
    100     extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
    101     //返回一个__m128d的寄存器,r0=(_A0 unord _B0) ? 0xffffffffffffffff : 0x0, r1=_A1
    102     extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);
    103     //返回一个0或1的整数,r=(_A0 == _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    104     extern int _mm_comieq_sd(__m128d _A, __m128d _B);
    105     //返回一个0或1的整数,r=(_A0 < _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    106     extern int _mm_comilt_sd(__m128d _A, __m128d _B);
    107     //返回一个0或1的整数,r=(_A0 <= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    108     extern int _mm_comile_sd(__m128d _A, __m128d _B);
    109     //返回一个0或1的整数,r=(_A0 > _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    110     extern int _mm_comigt_sd(__m128d _A, __m128d _B);
    111     //返回一个0或1的整数,r=(_A0 >= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    112     extern int _mm_comige_sd(__m128d _A, __m128d _B);
    113     //返回一个0或1的整数,r=(_A0 != _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    114     extern int _mm_comineq_sd(__m128d _A, __m128d _B);
    115     //返回一个0或1的整数,r=(_A0 == _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    116     extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
    117     //返回一个0或1的整数,r=(_A0 < _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    118     extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
    119     //返回一个0或1的整数,r=(_A0 <= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned
    120     extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
    121     //返回一个0或1的整数,r=(_A0 > _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    122     extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
    123     //返回一个0或1的整数,r=(_A0 >= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    124     extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
    125     //返回一个0或1的整数,r=(_A0 != _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned
    126     extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);
    127 
    128     //Conversion Operations
    129     //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1
    130     extern __m128d _mm_cvtepi32_pd(__m128i _A);
    131     //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=0x0, r3=0x0
    132     extern __m128i _mm_cvtpd_epi32(__m128d _A);
    133     //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=0x0, r3=0x0,using truncate
    134     extern __m128i _mm_cvttpd_epi32(__m128d _A);
    135     //返回一个__m128的寄存器,r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3
    136     extern __m128 _mm_cvtepi32_ps(__m128i _A);
    137     //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=(int)_A2, r3=(int)_A3
    138     extern __m128i _mm_cvtps_epi32(__m128 _A);
    139     //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=(int)_A2, r3=(int)_A3,using truncate
    140     extern __m128i _mm_cvttps_epi32(__m128 _A);
    141     //返回一个__m128的寄存器,r0=(float)_A0, r1=(float)_A1, r2=0.0, r3=0.0
    142     extern __m128 _mm_cvtpd_ps(__m128d _A);
    143     //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1
    144     extern __m128d _mm_cvtps_pd(__m128 _A);
    145     //返回一个__m128的寄存器,r0=(float)_B0, r1=_A1, r2=_A2, r3=_A3
    146     extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
    147     //返回一个__m128d的寄存器,r0=(double)_B0, r1=_A1
    148     extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);
    149     //返回一个32bit整数,r=(int)_A0
    150     extern int _mm_cvtsd_si32(__m128d _A);
    151     //返回一个32bit整数,r=(int)_A0,using truncate
    152     extern int _mm_cvttsd_si32(__m128d _A);
    153     //返回一个__m128d的寄存器,r0=(double)_B, r1=_A1
    154     extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);
    155     //返回一个__m64的寄存器,r0=(int)_A0, r1=(int)_A1
    156     extern __m64 _mm_cvtpd_pi32(__m128d _A);
    157     //返回一个__m64的寄存器,r0=(int)_A0, r1=(int)_A1,using truncate
    158     extern __m64 _mm_cvttpd_pi32(__m128d _A);
    159     //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1
    160     extern __m128d _mm_cvtpi32_pd(__m64 _A);
    161 
    162     //Miscellaneous Operations(Floating-Point SSE2 Intrinsics)
    163     //返回一个__m128d的寄存器,r0=_A1, r1=_B1
    164     extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
    165     //返回一个__m128d的寄存器,r0=_A0, r1=_B0
    166     extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
    167     //返回一个2bit整数,r=sign(_A1) << 1 | sign(_A0)
    168     extern int _mm_movemask_pd(__m128d _A);
    169     //返回一个__m128d的寄存器,Selects two specific double-precision,
    170     // floating-point values from _A and _B, based on the mask _I,
    171     //The mask must be an immediate
    172     extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);
    173 
    174     //Load Operations(Floating-Point SSE2 Intrinsics)
    175     //返回一个__m128d的寄存器,r0=_Dp[0], r1=_Dp[1], The address _Dp must be 16-byte aligned
    176     extern __m128d _mm_load_pd(double const*_Dp);
    177     //返回一个__m128d的寄存器,r0=*_Dp, r1=*_Dp, The address _Dp does not need
    178     //to be 16-byte aligned
    179     extern __m128d _mm_load1_pd(double const*_Dp);
    180     //返回一个__m128d的寄存器,r0=_Dp[1], r1=_Dp[0], The address _Dp must be 16-byte aligned
    181     extern __m128d _mm_loadr_pd(double const*_Dp);
    182     //返回一个__m128d的寄存器,r0=_Dp[0], r1=_Dp[1], The address _Dp does not 
    183     //need to be 16-byte aligned
    184     extern __m128d _mm_loadu_pd(double const*_Dp);
    185     //返回一个__m128d的寄存器,r0=*_Dp, r1=0.0, The address _Dp does not 
    186     //need to be 16-byte aligned
    187     extern __m128d _mm_load_sd(double const*_Dp);
    188     //返回一个__m128d的寄存器,r0=_A0, r1=*_Dp, The address _Dp does not 
    189     //need to be 16-byte aligned
    190     extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
    191     //返回一个__m128d的寄存器,r0=*_Dp, r1=_A1, The address _Dp does not
    192     //need to be 16-byte aligned
    193     extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);
    194 
    195     //Set Operations(Floating-Point SSE2 Intrinsics)
    196     //返回一个__m128d的寄存器,r0=_W, r1=0.0
    197     extern __m128d _mm_set_sd(double _W);
    198     //返回一个__m128d的寄存器,r0=_A, r1=_A
    199     extern __m128d _mm_set1_pd(double _A);
    200     //返回一个__m128d的寄存器,r0=_Y, r1=_Z
    201     extern __m128d _mm_set_pd(double _Z, double _Y);
    202     //返回一个__m128d的寄存器,r0=_Y, r1=_Z
    203     extern __m128d _mm_setr_pd(double _Y, double _Z);
    204     //返回一个__m128d的寄存器,r0=0.0, r1=0.0
    205     extern __m128d _mm_setzero_pd(void);
    206     //返回一个__m128d的寄存器,r0=_B0, r1=_A1
    207     extern __m128d _mm_move_sd(__m128d _A, __m128d _B);
    208 
    209     //Store Operations(Floating-Point SSE2 Intrinsics)
    210     //返回为空,*_Dp=_A0, The address _Dp does not need to be 16-byte aligned
    211     extern void _mm_store_sd(double *_Dp, __m128d _A);
    212     //返回为空,_Dp[0]=_A0, _Dp[1]=_A0, The address _Dp must be 16-byte aligned
    213     extern void _mm_store1_pd(double *_Dp, __m128d _A);
    214     //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, The address _Dp must be 16-byte aligned
    215     extern void _mm_store_pd(double *_Dp, __m128d _A);
    216     //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, The address _Dp does not need to be 16-byte aligned
    217     extern void _mm_storeu_pd(double *_Dp, __m128d _A);
    218     //返回为空,_Dp[0]=_A1, _Dp[1]=_A0, The address _Dp must be 16-byte aligned
    219     extern void _mm_storer_pd(double *_Dp, __m128d _A);
    220     //返回为空,*_Dp=_A1
    221     extern void _mm_storeh_pd(double *_Dp, __m128d _A);
    222     //返回为空,*_Dp=_A0
    223     extern void _mm_storel_pd(double *_Dp, __m128d _A);
    224 
    225     //new convert to float
    226     //返回一个64bit double类型,r=_A0, Extracts the lower order floating point value
    227     extern double _mm_cvtsd_f64(__m128d _A);
    228 
    229     //Cache Support for Streaming SIMD Extensions 2 Floating-Point Operations
    230     //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, Stores the data in _A to the address _Dp without
    231     //polluting caches. The address _Dp must be 16-byte aligned. If the cache line 
    232     //containing address _Dp is already in the cache, the cache will be updated
    233     extern void _mm_stream_pd(double *_Dp, __m128d _A);
    234 
    235     /*------------Integer Intrinsics Using Streaming SIMD Extensions 2-------------*/
    236     //Arithmetic Operations(Integer SSE2 Intrinsics):add、sub、mul、avg、min、max
    237     //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1, ... r15=_A15+_B15
    238     extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
    239     //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相加,
    240     //即ri=_Ai+_Bi(r0=_A0+_B0, r1=_A1+_B1, ... r7=_A7+_B7)
    241     extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
    242     //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1, r2=_A2+_B2, r3=_A3+_B3
    243     extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
    244     //返回一个__m64的寄存器,r=_A+_B
    245     extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
    246     //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1
    247     extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
    248     //返回一个__m128i的寄存器,r0=SignedSaturate(_A0+_B0), r1=SignedSaturate(_A1+_B1), ... 
    249     //r15=SignedSaturate(_A15+_B15), saturates
    250     extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
    251     //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相加,
    252     //r0=SignedSaturate(_A0+_B0), r1=SignedSaturate(_A1+_B1), ... 
    253     //r7=SignedSaturate(_A7+_B7), 当计算结果溢出时将其置为边界值(saturates)
    254     extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
    255     //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0+_B0), r1=UnsignedSaturate(_A1+_B1), ... 
    256     //r15=UnsignedSaturate(_A15+_B15), saturates
    257     extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
    258     //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0+_B0), r1=UnsignedSaturate(_A1+_B1), ... 
    259     //r7=UnsignedSaturate(_A7+_B7), saturates
    260     extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
    261     //返回一个__m128i的寄存器,r0=(_A0+_B0)/2, r1=(_A1+_B1)/2, ... r15=(_A15+_B15)/2, rounds
    262     extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B); 
    263     //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit无符号整数取平均,
    264     //即ri=(_Ai+_Bi)/2(r0=(_A0+_B0)/2, r1=(_A1+_B1)/2, ... r7=(_A7+_B7)/2), rounds
    265     extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
    266     //返回一个__m128i的寄存器,它含有4个有符号或无符号32bit的整数,
    267     //分别满足:r0=(_A0*_B0)+(_A1*_B1), r1=(_A2*_B2)+(_A3*_B3), 
    268     //r2=(_A4*_B4)+(_A5*_B5), r3=(_A6*_B6)+(_A7*_B7)
    269     extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
    270     //返回一个__m128i的寄存器,取_A和_B中对应位置的16bit有符号整数的最大值,
    271     //即ri=max(_Ai,_Bi) (r0=max(_A0,_B0), r1=max(_A1,_B1), ... r7=max(_A7,_B7))
    272     extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
    273     //返回一个__m128i的寄存器,r0=max(_A0,_B0), r1=max(_A1,_B1), ... r15=max(_A15,_B15)
    274     extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
    275     //返回一个__m128i的寄存器,取_A和_B中对应位置的16bit有符号整数的最小值,
    276     //即ri=min(_Ai, _Bi)(r0=min(_A0,_B0), r1=min(_A1,_B1), ... r7=min(_A7,_B7))
    277     extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
    278     //返回一个__m128i的寄存器,r0=min(_A0,_B0), r1=min(_A1,_B1), ... r15=min(_A15,_B15)
    279     extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
    280     //返回一个__m128i的寄存器,它含8个有符号16bit的整数,分别为_A和_B对应位置的16bit
    281     //有符号整数相乘结果的高16bit数据,即ri=(_Ai*_Bi)[31:16](r0=(_A0*_B0)[31:16], 
    282     //r1=(_A1*_B1)[31:16] ... r7=(_A7*_B7)[31:16])
    283     extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
    284     //返回一个__m128i的寄存器,r0=(_A0*_B0)[31:16], r1=(_A1*_B1)[31:16] ... r7=(_A7*_B7)[31:16]
    285     extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
    286     //返回一个__m128i的寄存器,它含8个有符号或无符号16bit的整数,分别为_A和_B对应位置的16bit
    287     //有符号或无符号整数相乘结果的低16bit数据,即ri=(_Ai*_Bi)[15:0](r0=(_A0*_B0)[15:0], 
    288     //r1=(_A1*_B1)[15:0] ... r7=(_A7*_B7)[15:0])
    289     extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
    290     //返回一个__m64的寄存器,r=_A0*_B0
    291     extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
    292     //返回一个__m128i的寄存器,r0=_A0*_B0, r1=_A2*_B2
    293     extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
    294     //返回一个__m128i的寄存器,r0=abs(_A0-_B0) + abs(_A1-_B1) + ... + abs(_A7-_B7), 
    295     //r1=0x0,r2=0x0, r3=0x0, r4=abs(_A8-_B8) + abs(_A9-_B9) + ... + abs(_A15-_B15), 
    296     //r5=0x0, r6=0x0, r7=0x0
    297     extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
    298     //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1, ... r15=_A15-_B15
    299     extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
    300     //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相减,
    301     //即ri=_Ai-_Bi(r0=_A0-_B0, r1=_A1-_B1, ... r7=_A7-_B7)
    302     extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
    303     //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1, r2=_A2-_B2, r3=_A3-_B3
    304     extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
    305     //返回一个__m64的寄存器,r=_A-_B
    306     extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
    307     //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1
    308     extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
    309     //返回一个__m128i的寄存器,r0=SignedSaturate(_A0-_B0), r1=SignedSaturate(_A1-_B1), ... 
    310     //r15=SignedSaturate(_A15-_B15), saturate
    311     extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
    312     //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相减,
    313     //当计算结果溢出时将其置为边界值(saturate), r0=SignedSaturate(_A0-_B0), 
    314     //r1=SignedSaturate(_A1-_B1), ... r7=SignedSaturate(_A7-_B7)
    315     extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
    316     //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0-_B0), r1=UnsignedSaturate(_A1-_B1), ...
    317     //r15=UnsignedSaturate(_A15-_B15), saturate
    318     extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
    319     //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0-_B0), r1=UnsignedSaturate(_A1-_B1), ... 
    320     //r7=UnsignedSaturate(_A7-_B7), saturate
    321     extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);
    322 
    323     //Logical Operations(Integer SSE2 Intrinsics):and、or、xor、andnot
    324     //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位与运算, r=_A & _B
    325     extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
    326     //返回一个__m128i的寄存器,将寄存器_A每一位取非,然后和寄存器_B的每一位进行按位与运算,
    327     //r=(~_A) & _B
    328     extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
    329     //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位或运算, r=_A | _B
    330     extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
    331     //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位异或运算, r=_A ^ _B
    332     extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);
    333 
    334     //Shift Operations
    335     //返回一个__m128i的寄存器,r=_A << (_Imm * 8),  _Imm must be an immediate,  
    336     //shifting in zeros
    337     extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
    338     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的逻辑左移,
    339     //r0=_A0 << _Count, r1=_A1 << _Count, ... r7=_A7 << count,  shifting in zeros
    340     extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
    341     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器低64bit给出的同一移位量
    342     //进行逻辑左移, r0=_A0 << _Count, r1=_A1 << _Count, ... r7=_A7 << count,  shifting in zeros
    343     extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
    344     //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, r2=_A2 << count, 
    345     //r3=_A3 << count,  shifting in zeros
    346     extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
    347     //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, r2=_A2 << count, 
    348     //r3=_A3 << count,  shifting in zeros
    349     extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
    350     //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count,  shifting in zeros
    351     extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
    352     //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count,  shifting in zeros
    353     extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
    354     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的算术右移,
    355     //r0=_A0 >> _Count, r1=_A1 >> _Count, ... r7=_A7 >> count,  shifting in the sign bit
    356     extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
    357     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器低64bit给出的同一移位量进行
    358     //算术右移,r0=_A0 >> _Count, r1=_A1 >> _Count, ... r7=_A7 >> count,  shifting in the sign bit
    359     extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
    360     //返回一个__m128i的寄存器,r0=_A0 >> _Count, r1=_A1 >> _Count, r2=_A2 >> count, 
    361     //r3=_A3 >> count,  shifting in the sign bit
    362     extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
    363     //返回一个__m128i的寄存器,r0=_A0 >> _Count, r1=_A1 >> _Count, r2=_A2 >> count,
    364     //r3=_A3 >> count,  shifting in the sign bit
    365     extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
    366     //返回一个__m128i的寄存器,r=srl(_A, _Imm * 8),   _Imm must be an immediate,  
    367     //shifting in zeros
    368     extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
    369     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的逻辑右移,
    370     //移位填充值为0,r0=srl(_A0, _Count), r1=srl(_A1, _Count), ... r7=srl(_A7, _Count), 
    371     //shifting in zeros
    372     extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
    373     //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器低64bit给出的同一移位量
    374     //进行逻辑右移,移位填充值为0, r0=srl(_A0, _Count), r1=srl(_A1, _Count), ... 
    375     //r7=srl(_A7, _Count),  shifting in zeros
    376     extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
    377     //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), r2=srl(_A2, _Count),
    378     //r3=srl(_A3, _Count),  shifting in zeros
    379     extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
    380     //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), r2=srl(_A2, _Count),
    381     //r3=srl(_A3, _Count),  shifting in zeros
    382     extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
    383     //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), shifting in zeros
    384     extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
    385     //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), shifting in zeros
    386     extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);
    387 
    388     //Comparison Intrinsics(SSE2):==、>、<
    389     //返回一个__m128i的寄存器,r0=(_A0 == _B0) ? 0xff : 0x00, 
    390     //r1=(_A1 == _B1) ? 0xff : 0x0, ... r15=(_A15 == _B15) ? 0xff : 0x0
    391     extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
    392     //返回一个__m128i的寄存器,分别比较寄存器_A和寄存器_B对应位置16bit整数是否相等,若相等,
    393     //该位置返回0xffff,否则返回0x0,即ri=(_Ai==_Bi)?0xffff:0x0(r0=(_A0 == _B0) ? 0xffff : 0x00, 
    394     //r1=(_A1 == _B1) ? 0xffff : 0x0, ... r7=(_A7 == _B7) ? 0xffff : 0x0)
    395     extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
    396     //返回一个__m128i的寄存器,r0=(_A0 == _B0) ? 0xffffffff : 0x00, 
    397     //r1=(_A1 == _B1) ? 0xffffffff : 0x0,
    398     //r2=(_A2 == _B2) ? 0xffffffff : 0x0, r3=(_A3 == _B3) ? 0xffffffff : 0x0
    399     extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
    400     //返回一个__m128i的寄存器,r0=(_A0 > _B0) ? 0xff : 0x00, r1=(_A1 > _B1) ? 0xff : 0x0, ...
    401     //r15=(_A15 > _B15) ? 0xff : 0x0
    402     extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
    403     //返回一个__m128i的寄存器,分别比较寄存器_A的每个16bit整数是否大于寄存器_B对应位置16bit的整数,
    404     //若大于,该位置返回0xffff,否则返回0x0,
    405     //即ri=(_Ai>_Bi)?0xffff:0x0(r0=(_A0 > _B0) ? 0xffff : 0x00, 
    406     //r1=(_A1 > _B1) ? 0xffff : 0x0, ... r7=(_A7 > _B7) ? 0xffff : 0x0)
    407     extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
    408     //返回一个__m128i的寄存器,r0=(_A0 > _B0) ? 0xffffffff : 0x00,
    409     //r1=(_A1 > _B1) ? 0xffffffff : 0x0,
    410     //r2=(_A2 > _B2) ? 0xffffffff : 0x0, r3=(_A3 > _B3) ? 0xffffffff : 0x0
    411     extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
    412     //返回一个__m128i的寄存器,r0=(_A0 < _B0) ? 0xff : 0x00, r1=(_A1 < _B1) ? 0xff : 0x0, ... 
    413     //r15=(_A15 < _B15) ? 0xff : 0x0
    414     extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
    415     //返回一个__m128i的寄存器,分别比较寄存器_A的每个16bit整数是否小于寄存器_B对应位置16bit整数,
    416     //若小于,该位置返回0xffff,否则返回0x0,
    417     //即ri=(_Ai<_Bi)?0xffff:0x0(r0=(_A0 < _B0) ? 0xffff : 0x00, 
    418     //r1=(_A1 < _B1) ? 0xffff : 0x0, ... r7=(_A7 < _B7) ? 0xffff : 0x0)
    419     extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
    420     //返回一个__m128i的寄存器,r0=(_A0 < _B0) ? 0xffffffff : 0x00,
    421     //r1=(_A1 < _B1) ? 0xffffffff : 0x0, 
    422     //r2=(_A2 < _B2) ? 0xffffffff : 0x0, r3=(_A3 < _B3) ? 0xffffffff : 0x0
    423     extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);
    424 
    425     //Conversion Intrinsics: int <-----> __m128i
    426     //返回一个__m128i的寄存器,r0=_A, r1=0x0, r2=0x0, r3=0x0
    427     extern __m128i _mm_cvtsi32_si128(int _A);
    428     //返回一个32bit整数,r=_A0
    429     extern int _mm_cvtsi128_si32(__m128i _A);
    430 
    431     //Miscellaneous Operations(Integer SSE2 Intrinsics)
    432     //返回一个__m128i的寄存器,r0=SignedSaturate(_A0), r1=SignedSaturate(_A1), ... 
    433     //r7=SignedSaturate(_A7), r8=SignedSaturate(_B0), r9=SignedSaturate(_B1), ... 
    434     //r15=SignedSaturate(_B7),  saturate
    435     extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
    436     //返回一个__m128i的寄存器,r0=SignedSaturate(_A0), r1=SignedSaturate(_A1), 
    437     //r2=SignedSaturate(_A2),r3=SignedSaturate(_A3), r4=SignedSaturate(_B0), 
    438     //r5=SignedSaturate(_B1), r6=SignedSaturate(_B2), r7=SignedSaturate(_B3),  saturate
    439     extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
    440     //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0), r1=UnsignedSaturate(_A1), ... 
    441     //r7=UnsignedSaturate(_A7),r8=UnsignedSaturate(_B0), r9=UnsignedSaturate(_B1), ... 
    442     //r15=UnsignedSaturate(_B7),  saturate
    443     extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
    444     //返回一个16bit整数,根据_Imm从_A中8个16bit数中选取对应编号的数,
    445     //r=(_Imm == 0) ? _A0 : ((_Imm == 1) ? _A1 : ... (_Imm == 7) ? _A7), 
    446     //_Imm must be an immediate, zero extends
    447     extern int _mm_extract_epi16(__m128i _A, int _Imm);
    448     //返回一个__m128i的寄存器,根据_Imm将_A中8个16bit数中对应编号的数替换为_B,
    449     //r0=(_Imm == 0) ? _B : _A0; r1=(_Imm == 1) ? _B : _A1, ... r7=(_Imm == 7) ? _B : _A7
    450     extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
    451     //返回一个16bit整数,r=(_A15[7] << 15) | (_A14[7] << 14) ... (_A1[7] << 1) | _A0[7], 
    452     //zero extends the upper bits
    453     extern int _mm_movemask_epi8(__m128i _A);
    454     //返回一个__m128i的寄存器,它是将_A中128bit数据以32bit为单位重新排列得到的,_Imm为
    455     //一个四元组,表示重新排列的顺序。当_A中原本存储的整数为16bit时,这条指令将其两两一组
    456     //进行排列。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7), _Imm=(2,3,0,1),其中_Ai为16bit整数,
    457     //_A0为低位,返回结果为(_A2,_A3,_A0,_A1,_A6,_A7,_A4,_A5),  _Imm must be an immediate
    458     extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
    459     //返回一个__m128i的寄存器,它是将_A中高64bit数据以16bit为单位重新排列得到的,_Imm为一个四元组,
    460     //表示重新排列的顺序。_A中低64bit数据顺序不变。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7), 
    461     //_Imm=(2,3,0,1),其中_Ai为16bit整数,_A0为低位,返回结果为(_A0,_A1,_A2,_A3,_A5,_A4,_A7,_A6), 
    462     //_Imm must be an immediate 
    463     extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
    464     //返回一个__m128i的寄存器,它是将_A中低64bit数据以16bit为单位重新排列得到的,_Imm为一个四元组,
    465     //表示重新排列的顺序。_A中高64bit数据顺序不变。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),
    466     //_Imm=(2,3,0,1),其中_Ai为16bit整数,_A0为低位,返回结果为(_A1,_A0,_A3,_A2,_A4,_A5,_A6,_A7),
    467     //_Imm must be an immediate
    468     extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
    469     //返回一个__m128i的寄存器,r0=_A8, r1=_B8, r2=_A9, r3=_B9, ... r14=_A15, r15=_B15
    470     extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
    471     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以16bit为单位交织在一块。
    472     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    473     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A4,_B4,_A5,_B5,_A6,_B6,_A7,_B7),
    474     //r0=_A4, r1=_B4, r2=_A5, r3=_B5, r4=_A6, r5=_B6, r6=_A7, r7=_B7
    475     extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
    476     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以32bit为单位交织在一块。
    477     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    478     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A4,_A5,_B4,_B5,_A6,_A7,_B6,_B7),
    479     //r0=_A2, r1=_B2, r2=_A3, r3=_B3
    480     extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
    481     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以64bit为单位交织在一块。
    482     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    483     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,
    484     //返回结果为(_A4,_A5,_A6,_A7,_B4,_B5,_B6,_B7), r0=_A1, r1=_B1
    485     extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
    486     //返回一个__m128i的寄存器,r0=_A0, r1=_B0, r2=_A1, r3=_B1, ... r14=_A7, r15=_B7
    487     extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
    488     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以16bit为单位交织在一块。
    489     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    490     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_B0,_A1,_B1,_A2,_B2,_A3,_B3),
    491     //r0=_A0, r1=_B0, r2=_A1, r3=_B1, r4=_A2, r5=_B2, r6=_A3, r7=_B3
    492     extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
    493     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以32bit为单位交织在一块。
    494     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    495     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_A1,_B0,_B1,_A2,_A3,_B2,_B3),
    496     //r0=_A0, r1=_B0, r2=_A1, r3=_B1
    497     extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
    498     //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以64bit为单位交织在一块。
    499     //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7),
    500     //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_A1,_A2,_A3,_B0,_B1,_B2,_B3), 
    501     //r0=_A0, r1=_B0
    502     extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);
    503 
    504     //Load Operations(Integer SSE2 Intrinsics)
    505     //返回为一个__m128i的寄存器,它将_P指向的数据读到指定寄存器中,实际使用时,
    506     //_P一般是通过类型转换得到的, Address _P must be 16-byte aligned
    507     extern __m128i _mm_load_si128(__m128i const*_P);
    508     //返回一个__m128i的寄存器,Loads 128-bit value, Address _P does not need be 16-byte aligned
    509     extern __m128i _mm_loadu_si128(__m128i const*_P);
    510     //返回一个__m128i的寄存器,r0=*_P[63:0], r1=0x0, zeroing the upper 64 bits of the result
    511     extern __m128i _mm_loadl_epi64(__m128i const*_P);
    512 
    513     //Set Operations(Integer SSE2 Intrinsics)
    514     //返回一个__m128i的寄存器,r0=_Q0, r1=_Q1
    515     extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
    516     //返回一个__m128i的寄存器,r0=_I0, r1=_I1, r2=_I2, r3=_I3
    517     extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
    518     //返回一个__m128i的寄存器,使用8个具体的short型数据来设置寄存器存放数据,
    519     //r0=_W0, r1=_W1, ... r7=_W7
    520     extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4, 
    521                                     short _W3, short _W2, short _W1, short _W0);
    522     //返回一个__m128i的寄存器,r0=_B0, r1=_B1, ... r15=_B15
    523     extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12, char _B11, 
    524                     char _B10, char _B9,char _B8, char _B7, char _B6, char _B5, char _B4, 
    525                     char _B3, char _B2, char _B1, char _B0);
    526     //返回一个__m128i的寄存器,r0=_Q, r1=_Q
    527     extern __m128i _mm_set1_epi64(__m64 _Q);
    528     //返回一个__m128i的寄存器,r0=_I, r1=_I, r2=_I, r3=_I
    529     extern __m128i _mm_set1_epi32(int _I);
    530     //返回一个__m128i的寄存器,r0=_W, r1=_W, ... r7=_W
    531     extern __m128i _mm_set1_epi16(short _W);
    532     //返回一个__m128i的寄存器,r0=_B, r1=_B, ... r15=_B
    533     extern __m128i _mm_set1_epi8(char _B);
    534     //返回一个__m128i的寄存器,r=_Q
    535     extern __m128i _mm_setl_epi64(__m128i _Q);
    536     //返回一个__m128i的寄存器,r0=_Q0, r1=_Q1
    537     extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
    538     //返回一个__m128i的寄存器,r0=_I0, r1=_I1, r2=_I2, r3=_I3
    539     extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
    540     //返回一个__m128i的寄存器,r0=_W0, r1=_W1, ... r7=_W7
    541     extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3, 
    542                                     short _W4, short _W5, short _W6, short _W7);
    543     //返回一个__m128i的寄存器,r0=_B15, r1=_B14, ... r15=_B0
    544     extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12, char _B11, 
    545         char _B10, char _B9, char _B8, char _B7, char _B6, char _B5, char _B4,  
    546         char _B3, char _B2, char _B1, char _B0);
    547     //返回一个__m128i的寄存器,r=0x0
    548     extern __m128i _mm_setzero_si128(void);
    549 
    550     //Store Operations(Integer SSE2 Intrinsics)
    551     //返回为空,它将寄存器_B中的数据存储到_P指向的地址中,实际使用时,
    552     //_P一般是通过类型转换得到的, *_P = _B, Address _P must be 16-byte aligned
    553     extern void _mm_store_si128(__m128i *_P, __m128i _B);
    554     //返回为空,*_P=_B, Address _P does not need to be 16-byte aligned
    555     extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
    556     //返回为空,*_P[63:0] =_Q0, lower 64 bits
    557     extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
    558     //返回为空,if(_N0[7]) _P[0]=_D0, if(_N1[7]) _P[1]=_D1, ... if(_N15[7]) _P[15]=_D15, 
    559     //The high bit of each byte in the selector _N determines whether the corresponding byte 
    560     //in _D will be stored. Address _P does not need to be 16-byte aligned
    561     extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);
    562 
    563     //Integer, moves
    564     //返回一个__m128i的寄存器,r0=_Q0, r1=0x0, zeroing the upper bits
    565     extern __m128i _mm_move_epi64(__m128i _Q);
    566     //返回一个__m128i的寄存器,r0=_Q, r1=0x0, zeroing the upper bits
    567     extern __m128i _mm_movpi64_epi64(__m64 _Q);
    568     //返回一个__m64的寄存器,r=_Q0
    569     extern __m64 _mm_movepi64_pi64(__m128i _Q);
    570 
    571     //Cache Support for Streaming SIMD Extensions 2 Integer Operations
    572     //返回为空,*_P=_A, Stores the data in _A to the address _P without polluting the caches.
    573     //If the cache line containing address _P is already in the cache, the cache will be updated. 
    574     //Address _P must be 16-byte aligned
    575     extern void _mm_stream_si128(__m128i *_P, __m128i _A);
    576     //返回为空,Cache line containing _P is flushed and invalidated from 
    577     //all caches in the coherency domain
    578     extern void _mm_clflush(void const*_P);
    579     //返回为空,Guarantees that every load instruction that precedes, in program order, the load 
    580     //fence instruction is globally visible before any load instruction 
    581     //that follows the fence in program order
    582     extern void _mm_lfence(void);
    583     //返回为空,Guarantees that every memory access that precedes, in program order, 
    584     //the memory fence instruction is globally visible before any memory instruction 
    585     //that follows the fence in program order
    586     extern void _mm_mfence(void);
    587     //返回为空,*_P=_I, Stores the data in _I to the address _P without polluting the caches. 
    588     //If the cache line containing address _P is already in the cache, the cache will be updated
    589     extern void _mm_stream_si32(int *_P, int _I);
    590     //返回为空,The execution of the next instruction is delayed an implementation specific 
    591     //amount of time. The instruction does not modify the architectural state. This intrinsic
    592     //provides especially significant performance gain when used in spin-wait loops
    593     extern void _mm_pause(void);
    594 
    595     /*---Support for casting between various SP, DP, INT vector types. Note that these do no 
    596         conversion of values, they just change the type----*/
    597     //返回一个__m128的寄存器,Applies a type cast to reinterpret two 64-bit floating 
    598     //point values passed in as a 128-bit parameter as packed 32-bit floating point values
    599     extern __m128  _mm_castpd_ps(__m128d);
    600     //返回一个__m128i的寄存器,Applies a type cast to reinterpret two 64-bit
    601     //floating point values passed in as a 128-bit parameter as packed 32-bit integers
    602     extern __m128i _mm_castpd_si128(__m128d);
    603     //返回一个__m128d的寄存器,Applies a type cast to reinterpret four 32-bit floating 
    604     //point values passed in as a 128-bit parameter as packed 64-bit floating point values
    605     extern __m128d _mm_castps_pd(__m128);
    606     //返回一个__m128i的寄存器,Applies a type cast to reinterpret four 32-bit floating 
    607     //point values passed in as a 128-bit parameter as packed 32-bit integers
    608     extern __m128i _mm_castps_si128(__m128);
    609     //返回一个__m128的寄存器,Applies a type cast to reinterpret four 32-bit integers 
    610     //passed in as a 128-bit parameter as packed 32-bit floating point values
    611     extern __m128  _mm_castsi128_ps(__m128i);
    612     //返回一个__m128d的寄存器,Applies a type cast to reinterpret four 32-bit 
    613     //integers passed in as a 128-bit parameter as packed 64-bit floating point values
    614     extern __m128d _mm_castsi128_pd(__m128i);

    reference:

    http://blog.csdn.net/fengbingchun/article/details/18460199

  • 相关阅读:
    .Net培训个人总结笔记28
    .Net培训个人总结笔记32
    .Net培训个人总结笔记21
    Codeforces Round #751 (Div. 1 & 2) Solutions
    「CEOI2017」Mousetrap 题解
    「CQOI2017」小Q的表格 题解
    mysql 非数字字段注意事项
    认识key及新鲜应用模式.
    B2C的革命: QQ网购
    php语言流程控制中的主动与被动.
  • 原文地址:https://www.cnblogs.com/galoishelley/p/4033254.html
Copyright © 2020-2023  润新知