• Horovod Install


    Horovod documentation

    安装

    【Step1】安装Open MPI

    注意:Open MPI 3.1.3 存在已知问题(可能导致程序挂起),建议改用 Open MPI 3.1.2 或 Open MPI 4.0.0。

    【Step2】安装 TensorFlow

    • 使用 pip install tensorflow 安装;若通过 pip(PyPI)安装 TensorFlow,请确保系统中已安装 g++-4.8.5 或 g++-4.9
    • 也可以用conda 安装

    【Step3】安装 horovod

    cpu

    pip install horovod
    

    GPUs with NCCL:

    $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install horovod
    

    Docker 文档:
    https://horovod.readthedocs.io/en/stable/docker.html

    https://raw.githubusercontent.com/horovod/horovod/master/Dockerfile.cpu
    https://raw.githubusercontent.com/horovod/horovod/master/Dockerfile.gpu
    

    CPU-Dockerfile

    # CPU-only Horovod image built on Ubuntu 18.04 (Bionic).
    FROM ubuntu:18.04

    # Framework versions consumed by the pip install steps of this image.
    ENV TENSORFLOW_VERSION=2.1.0 \
        PYTORCH_VERSION=1.4.0 \
        TORCHVISION_VERSION=0.5.0 \
        MXNET_VERSION=1.6.0

    # Python 3.6 is supported by Ubuntu Bionic out of the box; override with
    # --build-arg python=<version>.
    ARG python=3.6
    ENV PYTHON_VERSION=${python}

    # Run every RUN step under bash. -u turns a reference to an unset variable
    # into an error; pipefail makes a pipeline fail when any stage fails rather
    # than only when the last one does.
    SHELL ["/bin/bash", "-o", "pipefail", "-cu"]
    
    # Install the compiler toolchain, common command-line tools, the selected
    # Python interpreter with its headers, and the system libraries listed
    # below. Every continued line ends with a backslash so the whole package
    # list stays part of this single RUN instruction.
    RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
            build-essential \
            cmake \
            g++-4.8 \
            git \
            curl \
            vim \
            wget \
            ca-certificates \
            libjpeg-dev \
            libpng-dev \
            python${PYTHON_VERSION} \
            python${PYTHON_VERSION}-dev \
            python${PYTHON_VERSION}-distutils \
            librdmacm1 \
            libibverbs1 \
            ibverbs-providers
    
    # Expose the selected interpreter as plain `python`.
    RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

    # Bootstrap pip. The unversioned get-pip.py only runs on currently
    # supported Python releases, so fetch the script published for
    # ${PYTHON_VERSION} instead. -f makes curl fail on an HTTP error rather
    # than saving the error page as get-pip.py.
    RUN curl -fsSLO https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
        python get-pip.py && \
        rm get-pip.py
    
    # Install TensorFlow, Keras, PyTorch and MXNet at the versions pinned in
    # the ENV section. --no-cache-dir keeps pip's download cache out of the
    # image layers, matching the Horovod install step.
    RUN pip install --no-cache-dir future typing
    RUN pip install --no-cache-dir numpy \
            tensorflow==${TENSORFLOW_VERSION} \
            keras \
            h5py
    RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
    RUN pip install --no-cache-dir mxnet==${MXNET_VERSION}
    
    # Install Open MPI 4.0.0 from source. Download, build, install and cleanup
    # all happen in one RUN instruction so the source tree under /tmp/openmpi
    # is removed in the same layer that created it.
    RUN mkdir /tmp/openmpi && \
        cd /tmp/openmpi && \
        wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
        tar zxf openmpi-4.0.0.tar.gz && \
        cd openmpi-4.0.0 && \
        ./configure --enable-orterun-prefix-by-default && \
        make -j $(nproc) all && \
        make install && \
        ldconfig && \
        rm -rf /tmp/openmpi
    
    # Install Horovod. The HOROVOD_WITH_* variables are set only for this pip
    # invocation, so they must stay on the same logical command line as
    # `pip install` (hence the line continuation).
    RUN HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \
        pip install --no-cache-dir horovod
    
    # Install OpenSSH for MPI to communicate between containers. The package
    # index is refreshed in the same layer as the install so the step does not
    # depend on an index left behind by an earlier, possibly cached, layer.
    RUN apt-get update && \
        apt-get install -y --no-install-recommends openssh-client openssh-server && \
        mkdir -p /var/run/sshd
    
    # Allow OpenSSH to talk to containers without asking for confirmation:
    # copy ssh_config without any existing StrictHostKeyChecking lines, append
    # "StrictHostKeyChecking no", then replace the original file.
    RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
        echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
        mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
    
    # Download the Horovod examples into /examples. GitHub no longer serves
    # repositories over Subversion, so take a shallow git clone instead (git is
    # installed by the apt step of this image) and keep only the examples
    # directory.
    RUN git clone --depth 1 https://github.com/horovod/horovod.git /tmp/horovod && \
        mv /tmp/horovod/examples /examples && \
        rm -rf /tmp/horovod

    WORKDIR "/examples"
    

    GPU-Dockerfile

    # GPU Horovod image built on the CUDA 10.1 development image for Ubuntu 18.04.
    FROM nvidia/cuda:10.1-devel-ubuntu18.04

    # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
    ENV TENSORFLOW_VERSION=2.1.0 \
        PYTORCH_VERSION=1.4.0 \
        TORCHVISION_VERSION=0.5.0 \
        CUDNN_VERSION=7.6.5.32-1+cuda10.1 \
        NCCL_VERSION=2.4.8-1+cuda10.1 \
        MXNET_VERSION=1.6.0

    # Python 3.6 is supported by Ubuntu Bionic out of the box; override with
    # --build-arg python=<version>.
    ARG python=3.6
    ENV PYTHON_VERSION=${python}

    # Run every RUN step under bash. -u turns a reference to an unset variable
    # into an error; pipefail makes a pipeline fail when any stage fails rather
    # than only when the last one does.
    SHELL ["/bin/bash", "-o", "pipefail", "-cu"]
    
    # Install the compiler toolchain, common command-line tools, cuDNN and NCCL
    # at the versions pinned in the ENV section, the selected Python
    # interpreter with its headers, and the system libraries listed below.
    # Every continued line ends with a backslash so the whole package list
    # stays part of this single RUN instruction.
    RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
            build-essential \
            cmake \
            g++-4.8 \
            git \
            curl \
            vim \
            wget \
            ca-certificates \
            libcudnn7=${CUDNN_VERSION} \
            libnccl2=${NCCL_VERSION} \
            libnccl-dev=${NCCL_VERSION} \
            libjpeg-dev \
            libpng-dev \
            python${PYTHON_VERSION} \
            python${PYTHON_VERSION}-dev \
            python${PYTHON_VERSION}-distutils \
            librdmacm1 \
            libibverbs1 \
            ibverbs-providers
    
    # Expose the selected interpreter as plain `python`.
    RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

    # Bootstrap pip. The unversioned get-pip.py only runs on currently
    # supported Python releases, so fetch the script published for
    # ${PYTHON_VERSION} instead. -f makes curl fail on an HTTP error rather
    # than saving the error page as get-pip.py.
    RUN curl -fsSLO https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
        python get-pip.py && \
        rm get-pip.py
    
    # Install TensorFlow, Keras, PyTorch and MXNet at the versions pinned in
    # the ENV section. --no-cache-dir keeps pip's download cache out of the
    # image layers, matching the Horovod install step.
    RUN pip install --no-cache-dir future typing
    RUN pip install --no-cache-dir numpy \
            tensorflow-gpu==${TENSORFLOW_VERSION} \
            keras \
            h5py

    # Install the CUDA 10.1 PyTorch wheels by direct URL. The wheel file name
    # embeds the interpreter/ABI tag (e.g. cp36-cp36m for CPython 3.6). The tag
    # is built from the running interpreter's version and ABI flags because the
    # wheel.pep425tags module is no longer shipped by the wheel package.
    RUN PY_TAG=$(python -c "import sys; v = 'cp%d%d' % sys.version_info[:2]; print('%s-%s%s' % (v, v, sys.abiflags))") && \
        pip install --no-cache-dir \
            https://download.pytorch.org/whl/cu101/torch-${PYTORCH_VERSION}-${PY_TAG}-linux_x86_64.whl \
            https://download.pytorch.org/whl/cu101/torchvision-${TORCHVISION_VERSION}-${PY_TAG}-linux_x86_64.whl
    RUN pip install --no-cache-dir mxnet-cu101==${MXNET_VERSION}
    
    # Install Open MPI 4.0.0 from source. Download, build, install and cleanup
    # all happen in one RUN instruction so the source tree under /tmp/openmpi
    # is removed in the same layer that created it.
    RUN mkdir /tmp/openmpi && \
        cd /tmp/openmpi && \
        wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
        tar zxf openmpi-4.0.0.tar.gz && \
        cd openmpi-4.0.0 && \
        ./configure --enable-orterun-prefix-by-default && \
        make -j $(nproc) all && \
        make install && \
        ldconfig && \
        rm -rf /tmp/openmpi
    
    # Install Horovod, temporarily using CUDA stubs: the first ldconfig call is
    # pointed at the CUDA stubs directory before the build, and the final plain
    # ldconfig refreshes the linker cache again once pip has finished. The
    # HOROVOD_* variables apply only to this pip invocation, so they must stay
    # on the same logical command line as `pip install`.
    RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
        HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \
             pip install --no-cache-dir horovod && \
        ldconfig
    
    # Install OpenSSH for MPI to communicate between containers. The package
    # index is refreshed in the same layer as the install so the step does not
    # depend on an index left behind by an earlier, possibly cached, layer.
    RUN apt-get update && \
        apt-get install -y --no-install-recommends openssh-client openssh-server && \
        mkdir -p /var/run/sshd
    
    # Allow OpenSSH to talk to containers without asking for confirmation:
    # copy ssh_config without any existing StrictHostKeyChecking lines, append
    # "StrictHostKeyChecking no", then replace the original file.
    RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
        echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
        mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
    
    # Download the Horovod examples into /examples. GitHub no longer serves
    # repositories over Subversion, so take a shallow git clone instead (git is
    # installed by the apt step of this image) and keep only the examples
    # directory.
    RUN git clone --depth 1 https://github.com/horovod/horovod.git /tmp/horovod && \
        mv /tmp/horovod/examples /examples && \
        rm -rf /tmp/horovod

    WORKDIR "/examples"
    
  • 相关阅读:
    分享24款非常有用的 jQuery 插件
    分享30个最佳WordPress电子商务主题
    使用 CSS3 可以实现的五种很酷很炫的效果
    记录一些常用的python库、软件或者网址
    树的遍历
    深度优先遍历怎么抓住小偷
    hash算法的应用
    mysql的一些常用操作(一)
    Serverless 2.0,鸡蛋还是银弹?
    基于 KubeVela 与 Kubernetes 打造“无限能力”的开放 PaaS
  • 原文地址:https://www.cnblogs.com/shix0909/p/13390986.html
Copyright © 2020-2023  润新知