• 修改ncnn的openmp异步处理方法 附C++样例代码


    ncnn刚发布不久,博主在ios下尝试编译。

    遇上了openmp的编译问题。

    寻找各种解决方案无果,亲自操刀。

    采用std::thread 替换 openmp。

    ncnn项目地址:

    https://github.com/Tencent/ncnn

    后来询问ncnn的作者才知道在ios下的编译方法。

    至此,当时的临时方案 采用std::thread 替换 openmp。

    想想也许在一些特定情况下还是比较适用的,同时也方便在两者之间进行切换验证。

    抽空写了一个示例项目。

    项目地址:

    https://github.com/cpuimage/ParallelFor

    贴上完整代码:

    #include <stdio.h>
    #include <stdlib.h>   
    #include <iostream>
    
    
    #if defined(_OPENMP)
    // compile with: /openmp  
    #include <omp.h>
    // OpenMP wall-clock reading taken once at program start-up; now() is measured against it.
    const double epoch = omp_get_wtime();

    // Returns the time elapsed since `epoch`, as reported by omp_get_wtime() (seconds).
    double now()
    {
        const double current = omp_get_wtime();
        return current - epoch;
    }
    #else 
    #include <chrono>
    // Reference time point captured once at program start-up; now() is measured against it.
    auto const epoch = std::chrono::steady_clock::now();

    // Returns the time elapsed since `epoch` in seconds, as a double.
    // The elapsed duration is converted directly to fractional seconds instead of being
    // truncated to whole milliseconds first, so sub-millisecond intervals remain
    // measurable (the omp_get_wtime() based variant also reports fractional seconds).
    double now() {
        const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - epoch;
        return elapsed.count();
    }
    #endif
    
    // Runs `fn` once and returns the difference between the now() readings
    // taken immediately before and immediately after the call.
    template<typename FN>
    double bench(const FN &fn) {
        const double started = now();
        fn();
        return now() - started;
    }
    
    #include <functional>
    
    #if defined(_OPENMP)
    #    include <omp.h>
    #else 
    #include <thread>
    
    #include <vector>
    #endif
    
    
    // Number of worker threads to use for parallel loops; always at least 1.
    #ifdef _OPENMP
    static int processorCount = omp_get_num_procs();
    #else
    // std::thread::hardware_concurrency() may return 0 when the value cannot be
    // determined; fall back to a single worker instead of storing 0.
    static int processorCount = [] {
        const unsigned int detected = std::thread::hardware_concurrency();
        return detected == 0 ? 1 : static_cast<int>(detected);
    }();
    #endif
    
    
    // Invokes func(i) exactly once for every index i in [inclusiveFrom, exclusiveTo) and
    // returns only after all invocations have finished.
    //
    //   inclusiveFrom  first index to process.
    //   exclusiveTo    one past the last index; an empty or reversed range is a no-op.
    //   func           callback run per index. It may be called from several threads at
    //                  the same time, so it must be safe to run concurrently for
    //                  distinct indices.
    //
    // With OpenMP enabled the loop is handed to `#pragma omp parallel for`. Otherwise the
    // range is cut into contiguous chunks and each chunk is processed by its own std::thread.
    static void ParallelFor(int inclusiveFrom, int exclusiveTo, std::function<void(size_t)> func)
    {
    #if defined(_OPENMP)
    #pragma omp parallel for num_threads(processorCount)
        for (int i = inclusiveFrom; i < exclusiveTo; ++i)
        {
            func(i);
        }
        return;
    #else
        if (inclusiveFrom >= exclusiveTo)
            return;

        // hardware_concurrency() may return 0 when the value cannot be determined; treat
        // that as a single worker so the chunk-size division below never divides by zero.
        // A const function-local static is initialized exactly once, even when several
        // threads enter this function at the same time.
        static const size_t thread_cnt = [] {
            const size_t detected = std::thread::hardware_concurrency();
            return detected == 0 ? static_cast<size_t>(1) : detected;
        }();

        // 64-bit arithmetic so neither the range length nor "begin + chunk" can overflow int.
        const long long first = inclusiveFrom;
        const long long last = exclusiveTo;
        const long long total = last - first;
        const long long workers = static_cast<long long>(thread_cnt);

        // Only one worker, or fewer indices than workers: run on the calling thread.
        if (workers < 2 || total < workers)
        {
            for (long long i = first; i < last; ++i)
            {
                func(static_cast<size_t>(i));
            }
            return;
        }

        // Round the chunk size up so the remainder is absorbed by the existing chunks
        // instead of spawning extra threads beyond thread_cnt.
        const long long chunk = (total + workers - 1) / workers;

        std::vector<std::thread> threads;
        threads.reserve(thread_cnt);

        for (long long begin = first; begin < last; begin += chunk)
        {
            const long long end = (begin + chunk < last) ? begin + chunk : last;

            // func is captured by reference: every thread is joined before this function
            // returns, so the reference cannot dangle. The bounds are passed by value.
            threads.emplace_back([&func](long long from, long long to)
            {
                for (long long idx = from; idx < to; ++idx)
                    func(static_cast<size_t>(idx));
            }, begin, end);
        }

        for (auto& t : threads)
        {
            t.join();
        }
    #endif
    }
    
    
    
    // Stores four times the i-th element of `b` into the i-th element of `a`.
    void test_scale(int i, double* a, double* b) {
        const double source = b[i];
        a[i] = 4 * source;
    }
    
    int main()
    {
        int N = 10000;
        double* a2 = (double*)calloc(N, sizeof(double));
        double* a1 = (double*)calloc(N, sizeof(double));
        double* b = (double*)calloc(N, sizeof(double));
        if (a1 == NULL || a2 == NULL || b == NULL)
        {
            if (a1)
            {
                free(a1);
            }if (a2)
            {
                free(a2);
            }if (b)
            {
                free(b);
            }
            return -1;
        }
        for (int i = 0; i < N; i++)
        {
            a1[i] = i;
            a2[i] = i;
            b[i] = i;
        }
        double beforeTime = bench([&] {
            for (int i = 0; i < N; i++)
            {
                test_scale(i, a1, b);
            }
        });
    
        std::cout << " 
    before: " << int(beforeTime * 1000) << "ms" << std::endl;
        double afterTime = bench([&] {
            ParallelFor(0, N, [a2, b](size_t i)
            {
                test_scale(i, a2, b);
            });
        });
        std::cout << " 
    after: " << int(afterTime * 1000) << "ms" << std::endl;
    
        for (int i = 0; i < N; i++)
        {
            if (a1[i] != a2[i]) {
                printf("error %f : %f 	", a1[i], a2[i]);
                getchar();
            }
        }
        free(a1);
        free(a2);
        free(b);
        getchar();
        return 0;
    }

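    要使用OPENMP,打开编译器的OpenMP选项即可(MSVC为 /openmp,GCC/Clang为 -fopenmp),此时编译器会自动定义 _OPENMP 宏;不建议只手动定义 _OPENMP,因为那样 #pragma omp 不会生效,omp_get_wtime 等函数也可能无法链接。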

    建议c++11编译。

    示例代码比较简单。

    ncnn代码修改例子如下:

       // Original form: the per-channel loop is parallelized with an OpenMP pragma.
       #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                // m and borderm are taken from channel q of src and dst respectively.
                const Mat m = src.channel(q);
                Mat borderm = dst.channel(q);
    
                copy_make_border_image(m, borderm, top, left, type, v);
            }

     改为

        // Same per-channel loop expressed with ParallelFor: the former loop body becomes
        // the lambda, which receives the channel index q for the range [0, channels).
        ParallelFor(0, channels, [&](int  q) {
                    {
                        // m and borderm are taken from channel q of src and dst respectively.
                        const Mat m = src.channel(q);
                        Mat borderm = dst.channel(q);
    
                        copy_make_border_image(m, borderm, top, left, type, v);
                    }});

    本来计划抽点时间把ncnn整体都改一下,发个修改版本出来。

    想想还是把做法贴出来,给有需求的人吧。

    自己动手丰衣足食。

    若有其他相关问题或者需求也可以邮件联系俺探讨。

    邮箱地址是: 
    gaozhihan@vip.qq.com

  • 相关阅读:
    JS获取单选框checked的value方法
    URL链接后面的参数解析,与decode编码解码;页面刷新回到顶部jquery
    JS原生增删,判断class是否存在方法
    转载:jquery.ajax之beforeSend方法使用介绍
    css3 filter(滤镜)属性汇总与使用介绍,来源W3C
    使用 HTML5 Geolocation 构建基于地理位置的 Web 应用学习网站分享
    js获取移动端触摸坐标
    jquery如何获取手机网页触屏坐标:ontouchstart 、ontouchend、ontouchmove
    js/jquery获取浏览器窗口可视区域高度和宽度以及滚动条高度实现代码
    JS获取浏览器可视区域的尺寸
  • 原文地址:https://www.cnblogs.com/cpuimage/p/8379501.html
Copyright © 2020-2023  润新知