From d6c2ba1b8f0ad3e87d1f9d4db58195444579121a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Wed, 3 Mar 2021 22:19:27 +1300 Subject: [PATCH] Add Dockerfile.gpu for GPU support --- docker/Dockerfile | 14 ++- docker/Dockerfile.gpu | 195 ++++++++++++++++++++++++++++++++++++++ docker/README.md | 53 +++++++++++ docker/docker-compose.yml | 7 +- 4 files changed, 265 insertions(+), 4 deletions(-) create mode 100644 docker/Dockerfile.gpu diff --git a/docker/Dockerfile b/docker/Dockerfile index 6d9ee91..955f6ca 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,12 +2,20 @@ FROM continuumio/miniconda3:latest RUN apt-get update && apt-get upgrade -y \ && apt-get install -y \ - libpq-dev \ build-essential \ + cmake \ + ffmpeg \ git \ + libboost-all-dev \ + libjpeg-dev \ + libpq-dev \ + libsdl2-dev swig \ sudo \ - cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev libboost-all-dev libsdl2-dev swig \ - unzip zip \ + unzip \ + xorg-dev \ + xvfb \ + zip \ + zlib1g-dev \ && rm -rf /var/lib/apt/lists/* COPY environment.yml /tmp/ diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu new file mode 100644 index 0000000..a1e014e --- /dev/null +++ b/docker/Dockerfile.gpu @@ -0,0 +1,195 @@ +# This Dockerfile includes sections from tensorflow/tensorflow:latest-gpu's Dockerfile: +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile +# and sections from continuumio/miniconda3:latest's Dockerfile: +# https://github.com/ContinuumIO/docker-images/blob/master/miniconda3/debian/Dockerfile + + +# First we need CUDA and everything else needed to support GPUs + +############################################### +#### FROM tensorflow/tensorflow:latest-gpu #### +############################################### +ARG UBUNTU_VERSION=18.04 + +ARG ARCH= +ARG CUDA=11.0 +FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base +# ARCH and CUDA are specified again because the FROM directive resets 
ARGs +# (but their default value is retained if set previously) +ARG ARCH +ARG CUDA +ARG CUDNN=8.0.4.30-1 +ARG CUDNN_MAJOR_VERSION=8 +ARG LIB_DIR_PREFIX=x86_64 +ARG LIBNVINFER=7.1.3-1 +ARG LIBNVINFER_MAJOR_VERSION=7 + +# Needed for string substitution +SHELL ["/bin/bash", "-c"] +# Pick up some TF dependencies +# [HOML2] Tweaked for handson-ml2: added all the libs before build-essentials +RUN apt-get update -q && apt-get install -q -y --no-install-recommends \ + bzip2 \ + ca-certificates \ + cmake \ + ffmpeg \ + git \ + libboost-all-dev \ + libglib2.0-0 \ + libjpeg-dev \ + libpq-dev \ + libsdl2-dev \ + libsm6 \ + libxext6 \ + libxrender1 \ + mercurial \ + subversion \ + sudo \ + swig \ + wget \ + xorg-dev \ + xvfb \ + zip \ + zlib1g-dev \ + build-essential \ + cuda-command-line-tools-${CUDA/./-} \ + libcublas-${CUDA/./-} \ + cuda-nvrtc-${CUDA/./-} \ + libcufft-${CUDA/./-} \ + libcurand-${CUDA/./-} \ + libcusolver-${CUDA/./-} \ + libcusparse-${CUDA/./-} \ + curl \ + libcudnn8=${CUDNN}+cuda${CUDA} \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libzmq3-dev \ + pkg-config \ + software-properties-common \ + unzip + +# Install TensorRT if not building for PowerPC +RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \ + apt-get install -y --no-install-recommends libnvinfer${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ + libnvinfer-plugin${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; } + +# For CUDA profiling, TensorFlow requires CUPTI. 
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + +# [HOML2] Tweaked for handson-ml2: removed Python3 & TensorFlow installation using pip + +################################################# +#### End of tensorflow/tensorflow:latest-gpu #### +################################################# + +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +ENV PATH /opt/conda/bin:/opt/conda/envs/tf2/bin:$PATH + +# Next we need to install miniconda + +############################################ +#### FROM continuumio/miniconda3:latest #### +############################################ + +# [HOML2] Tweaked for handson-ml2: removed the beginning of the Dockerfile +CMD [ "/bin/bash" ] + +# Leave these args here to better use the Docker build cache +ARG CONDA_VERSION=py38_4.9.2 +ARG CONDA_MD5=122c8c9beb51e124ab32a0fa6426c656 + +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh -O miniconda.sh && \ + echo "${CONDA_MD5} miniconda.sh" > miniconda.md5 && \ + if ! md5sum --status -c miniconda.md5; then exit 1; fi && \ + mkdir -p /opt && \ + sh miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh miniconda.md5 && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + find /opt/conda/ -follow -type f -name '*.a' -delete && \ + find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ + /opt/conda/bin/conda clean -afy + +############################################## +#### End of continuumio/miniconda3:latest #### +############################################## + +# Now we're ready to create our conda environment + +COPY environment.yml /tmp/ +RUN conda update -y -n base conda \ + && echo ' - pyvirtualdisplay' >> /tmp/environment.yml \ + && conda env create -f /tmp/environment.yml \ + && conda clean -y -t \ + && rm /tmp/environment.yml + +ARG username +ARG userid + +ARG home=/home/${username} +ARG workdir=${home}/handson-ml2 + +RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} + +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} + +USER ${username} +WORKDIR ${workdir} + + +# The config below enables diffing notebooks with nbdiff (and nbdiff support +# in git diff command) after connecting to the container by "make exec" (or +# "docker-compose exec handson-ml2 bash") +# You may also try running: +# nbdiff NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the +# given notebook. 
+ +RUN git-nbdiffdriver config --enable --global + +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc \ + && echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc \ + && sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py + + +# INFO: Uncomment the RUN command below to disable git diff paging +#RUN git config --global core.pager '' + + +# INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) +# That will switch Jupyter to using empty password instead of a token. +# To avoid making a security hole you SHOULD in fact not only uncomment but +# regenerate the hash for your own non-empty password and replace the hash below. 
+# You can compute a password hash in any notebook, just run the code: +# from notebook.auth import passwd +# passwd() +# and take the hash from the output +#RUN mkdir -p ${home}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/README.md b/docker/README.md index eb5a558..3577212 100644 --- a/docker/README.md +++ b/docker/README.md @@ -71,3 +71,56 @@ You can see changes you made relative to the version in git using `git diff` whi You may also try `nbd NOTEBOOK_NAME.ipynb` command (custom, see bashrc file) to compare one of your notebooks with its `checkpointed` version.
To be precise, the output will tell you *what modifications should be re-played on the **manually saved** version of the notebook (located in `.ipynb_checkpoints` subdirectory) to update it to the **current** i.e. **auto-saved** version (given as command's argument - located in working directory)*. + +## GPU Support on Linux (experimental) + +If you're using Linux, and you have a TensorFlow-compatible GPU card (NVidia card with Compute Capability ≥ 3.5) that you would like TensorFlow to use inside the docker container, then you should download and install the latest driver for your card from [nvidia.com](https://www.nvidia.com/Download/index.aspx?lang=en-us). You will also need to install [NVidia Docker support](https://github.com/NVIDIA/nvidia-docker): if you are using Docker 19.03 or above, you must install the `nvidia-container-toolkit` package, and for earlier versions, you must install `nvidia-docker2`. + +To build the image, edit `docker-compose.yml`, replace the line `dockerfile: ./docker/Dockerfile` with `dockerfile: ./docker/Dockerfile.gpu`, and then run: + +```bash +$ cd /path/to/project/handson-ml2/docker +$ docker-compose build +``` + +To run the image, it depends. If you have `docker-compose` version 1.28 or above, that's great! You can simply uncomment the `deploy` section in `docker-compose.yml`, and then run: + +```bash +$ cd /path/to/project/handson-ml2/docker +$ docker-compose up +[...] + or http://127.0.0.1:8888/?token=[...] +``` + +However, if you have an earlier version of `docker-compose`, it's simpler to use `docker run` directly. 
If you are using Docker 19.03 or above, you can run: + +```bash +$ cd /path/to/project/handson-ml2 +$ docker run --name handson-ml2 --gpus all -p 8888:8888 -p 6006:6006 --log-opt mode=non-blocking --log-opt max-buffer-size=50m -d -v `pwd`:/home/devel/handson-ml2 handson-ml2 /opt/conda/envs/tf2/bin/jupyter notebook --ip='0.0.0.0' --port=8888 --no-browser +``` + +If you are using an older version of Docker, then replace `--gpus all` with `--runtime=nvidia`. + +Then, display the container's logs and point your browser to the URL printed on the screen: + +```bash +$ docker logs handson-ml2 +[I 09:07:10.805 NotebookApp] Writing notebook server cookie secret to /home/devel/.local/share/jupyter/runtime/notebook_cookie_secret +[...] + or http://127.0.0.1:8888/?token=[...] +``` + +If everything goes well, Jupyter should appear, and if you open a notebook and execute the following code, it should show a GPU device in the list: + +```python +import tensorflow as tf + +tf.config.list_physical_devices() +``` + +Lastly, to stop and destroy the container (but not the image), run: + +```bash +$ docker stop handson-ml2 +$ docker rm handson-ml2 +``` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 2cbefc5..25b8065 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -3,7 +3,7 @@ services: handson-ml2: build: context: ../ - dockerfile: ./docker/Dockerfile + dockerfile: ./docker/Dockerfile #Dockerfile.gpu args: - username=devel - userid=1000 @@ -20,3 +20,8 @@ services: volumes: - ../:/home/devel/handson-ml2 command: /opt/conda/envs/tf2/bin/jupyter notebook --ip='0.0.0.0' --port=8888 --no-browser + #deploy: + # resources: + # reservations: + # devices: + # - capabilities: [gpu] \ No newline at end of file