• fast powf


    测试结果:

    sum (fast) in clock 1562
    sum (fast2) in clock 1407
    sum (fast3) in clock 3156
    sum in clock 7797
    Error is 1.512115
    Error2 is 0.030914
    Error3 is 0.001389

    #include <stdio.h>
    #include <xmmintrin.h>
    #define NOMINMAX
    #include <windows.h>
    #include <math.h>
    #include <time.h>
    
    /*
     * (c) Ian Stephenson
     *
     * ian@dctsystems.co.uk
     *
     * Fast pow() reference implementation
     */
    
    /*
     * http://www.dctsystems.co.uk/Software/power.html
     * http://www.dctsystems.co.uk/Software/power.c
     */
    const float shift23=(1<<23);
    const float OOshift23=1.0/(1<<23);
    
    __forceinline float myFloorf(float a)
    {
        return (float)((int)a - (a < 0.0f));
    }
    
    __forceinline float myLog2(float i)
        {
        float LogBodge=0.346607f;
        float x;
        float y;
        x=(float)(*(int *)&i);
        x*= OOshift23; //1/pow(2,23);
        x=x-127;
    
        y=x-myFloorf(x);
        y=(y-y*y)*LogBodge;
        return x+y;
        }
    __forceinline float myPow2(float i)
        {
        float PowBodge=0.33971f;
        float x;
        float y=i-myFloorf(i);
        y=(y-y*y)*PowBodge;
    
        x=i+127-y;
        x*= shift23; //pow(2,23);
        *(int*)&x=(int)x;
        return x;
        }
    
    __forceinline float myPow(float a, float b)
        {
        return myPow2(b*myLog2(a));
        }
    
    ///////////////////////////////////////
    
    /* Code below are from http://code.google.com/p/fastapprox/ */
    __forceinline float fastpow2(float p)
    {
        float offset = (p < 0) ? 1.0f : 0.0f;
        float clipp = (p < -126) ? -126.0f : p;
        int w = (int)clipp;
        float z = clipp - w + offset;
        union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
        return v.f;
    }
    
    __forceinline float fastlog2(float x)
    {
        union { float f; unsigned int i; } vx = { x };
        union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
        float y = (float)vx.i;
        y *= 1.1920928955078125e-7f;
        return y - 124.22551499f 
            - 1.498030302f * mx.f 
            - 1.72587999f / (0.3520887068f + mx.f);
    }
    
    __forceinline float fastpow(float x, float p)
    {
        return fastpow2(p * fastlog2(x));
    }
    
    /////////////////////////////////////////////////
    
    #define FLT_MIN        1.175494351e-38F
    #define FLT_MAX        3.402823466e+38F
    
    template <typename T>
    __forceinline T min(T a, T b)
    {
        return ((a < b) ? a : b);
    }
    
    __forceinline float fast_fabs(float x)
    {
        union { float f; unsigned int i; } v = {x};
        v.i &= 0x7FFFFFFF;
        return v.f;
    }
    
    /// Multiply and add: (a * b) + c
    template <typename T>
    __forceinline T madd (const T& a, const T& b, const T& c) {
        // NOTE:  in the future we may want to explicitly ask for a fused
        // multiply-add in a specialized version for float.
        // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
        // explicitly asked not to, clang seems to leave the code alone.
        return a * b + c;
    }
    
    template <typename IN_TYPE, typename OUT_TYPE>
    __forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
        union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
        cvt.in_val = in;
        return cvt.out_val;
    }
    
    __forceinline float fast_log2 (float x) {
        // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
        if (x < FLT_MIN) x = FLT_MIN;
        if (x > FLT_MAX) x = FLT_MAX;
        // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
        unsigned bits = bit_cast<float, unsigned>(x);
        int exponent = int(bits >> 23) - 127;
        float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
        // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
        // ulp histogram:
        //  0  = 97.46%
        //  1  =  2.29%
        //  2  =  0.11%
        float f2 = f * f;
        float f4 = f2 * f2;
        float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
        float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
        hi = madd(f, hi, -0.13753123777116f);
        hi = madd(f, hi,  0.24187369696082f);
        hi = madd(f, hi, -0.34730547155299f);
        lo = madd(f, lo,  1.442689881667200f);
        return ((f4 * hi) + (f * lo)) + exponent;
    }
    
    __forceinline float fast_exp2 (float x) {
        // clamp to safe range for final addition
        if (x < -126.0f) x = -126.0f;
        if (x >  126.0f) x =  126.0f;
        // range reduction
        int m = int(x); x -= m;
        x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
        // 5th degree polynomial generated with sollya
        // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
        // ulp histogram:
        //  0  = 87.81%
        //  1  =  4.18%
        float r = 1.33336498402e-3f;
        r = madd(x, r, 9.810352697968e-3f);
        r = madd(x, r, 5.551834031939e-2f);
        r = madd(x, r, 0.2401793301105f);
        r = madd(x, r, 0.693144857883f);
        r = madd(x, r, 1.0f);
        // multiply by 2 ^ m by adding in the exponent
        // NOTE: left-shift of negative number is undefined behavior
        return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
    }
    
    __forceinline float fast_safe_pow (float x, float y) {
        if (y == 0) return 1.0f; // x^0=1
        if (x == 0) return 0.0f; // 0^y=0
        // be cheap & exact for special case of squaring and identity
        if (y == 1.0f)
            return x;
        if (y == 2.0f)
            return min (x*x, FLT_MAX);
        float sign = 1.0f;
        if (x < 0) {
            // if x is negative, only deal with integer powers
            // powf returns NaN for non-integers, we will return 0 instead
            int ybits = bit_cast<float, int>(y) & 0x7fffffff;
            if (ybits >= 0x4b800000) {
                // always even int, keep positive
            } else if (ybits >= 0x3f800000) {
                // bigger than 1, check
                int k = (ybits >> 23) - 127;  // get exponent
                int j =  ybits >> (23 - k);   // shift out possible fractional bits
                if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                    sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
                else
                    return 0.0f; // not integer
            } else {
                return 0.0f; // not integer
            }
        }
        return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
    }
    
    /////////
    int main(int argc, char *argv[])
    {
        const int N = 100000000;
        float *buf = new float[N];
        float *a = new float[N];
        float *b = new float[N];
        float *c = new float[N];
        float *d = new float[N];
        for (int i = 0; i < N; ++i)
        {
            buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
        }
    
        int start_time;
    
    
        start_time = clock();
        for (int i = 0; i < N; ++i)
        {
            a[i] = myPow(buf[i], 0.8f);
        }
        printf("sum (fast) in clock %d
    ", clock() - start_time);
    
    
    
        start_time = clock();
        for (int i = 0; i < N; ++i)
        {
            c[i] = fastpow(buf[i], 0.8f);
        }
        printf("sum (fast2) in clock %d
    ", clock() - start_time);
    
    
    
        start_time = clock();
        for (int i = 0; i < N; ++i)
        {
            d[i] = fast_safe_pow(buf[i], 0.8f);
        }
        printf("sum (fast3) in clock %d
    ", clock() - start_time);
    
    
    
        start_time = clock();
        for (int i = 0; i < N; ++i)
        {
            b[i] = powf(buf[i], 0.8f);
        }
        printf("sum in clock %d
    ", clock() - start_time);
    
    
        float max_err = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            float err = fabsf(a[i] - b[i]);
            if (err > max_err)
                max_err = err;
        }
        printf("Error is %f
    ", max_err);
    
    
        max_err = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            float err = fabsf(b[i] - c[i]);
            if (err > max_err)
                max_err = err;
        }
        printf("Error2 is %f
    ", max_err);
    
    
        max_err = 0.0f;
        for (int i = 0; i < N; ++i)
        {
            float err = fabsf(b[i] - d[i]);
            if (err > max_err)
                max_err = err;
        }
        printf("Error3 is %f
    ", max_err);
    
    
        delete[]buf;
        delete[]a;
        delete[]b;
        delete[]c;
        delete[]d;
        return 0;
    }
  • 相关阅读:
    使用shell脚本对Linux系统和进程资源进行监控
    C++中string类的方法
    [leetcode] Trapping Rain Water
    Jenkins系列-Jenkins用户权限和角色配置
    ssh免秘钥登录
    Jenkins系列-Jenkins通过Publish over SSH插件实现远程部署
    Jenkins系列-Jenkins添加git密钥对
    Jenkins系列-Jenkins插件下载镜像加速
    Jenkins系列-Jenkins初始化配置
    Jenkins系列-Jenkins忘记密码的修复方法
  • 原文地址:https://www.cnblogs.com/len3d/p/7912175.html
Copyright © 2020-2023  润新知