From e7a1273c486d7629b889482e41743d956d2fac11 Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 06:09:45 +0100 Subject: [PATCH 01/15] Docker environment minutiae Docker compose project name set to avoid collisions, smiley dropped from README heading --- docker/.env | 1 + docker/README.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docker/.env diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..16adf41 --- /dev/null +++ b/docker/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=handson-ml diff --git a/docker/README.md b/docker/README.md index 50b6f12..2355c45 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ -# Hands-on Machine Learning in Docker :-) +# Hands-on Machine Learning in Docker This is the Docker configuration which allows you to run and tweak the book's notebooks without installing any dependencies on your machine!
OK, any except `docker`. With `docker-compose`. Well, you may also want `make` (but it is only used as thin layer to call a few simple `docker-compose` commands). From 8d16b3061d5ba3b5282190c13547f33819099ede Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 12:09:16 +0100 Subject: [PATCH 02/15] Patches to nbdiff for skipping noisy metadata, some local config Nbdiff --ignore-details skips autoscroll, collapsed, deletable, editable, toc (pull request on the way). Enabling empty password, no git pager, ignoring nbdiff details in git diff. --- docker/Dockerfile | 43 ++++++++++++++++++++++++----------- docker/bashrc | 4 ++-- docker/nbdime-1-details.patch | 17 ++++++++++++++ docker/nbdime-2-toc.patch | 11 +++++++++ 4 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 docker/nbdime-1-details.patch create mode 100644 docker/nbdime-2-toc.patch diff --git a/docker/Dockerfile b/docker/Dockerfile index 54e5510..6b2852e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,8 +21,10 @@ RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ ENV HOME /home/${username} -WORKDIR ${HOME}/handson-ml -RUN chown ${username}:${username} ${HOME}/handson-ml +ARG workdir=${HOME}/handson-ml + +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} USER ${username} @@ -30,7 +32,7 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main -# INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) +## INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) # That will switch jupyter to using empty password instead of a token. # To avoid making a security hole you SHOULD in fact not only uncomment but # regenerate the hash for your own non-empty password and replace the hash below.
@@ -38,12 +40,12 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py +RUN mkdir -p ${HOME}/.jupyter && \ + echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ + >> ${HOME}/.jupyter/jupyter_notebook_config.py -# INFO: Uncomment the RUN command below to disable git diff paging -#RUN git config --global core.pager '' +## INFO: Uncomment the RUN command below to disable git diff paging +RUN git config --global core.pager '' # INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing @@ -54,18 +56,33 @@ RUN jupyter nbextension enable toc2/main # to get nbdiff between checkpointed version and current version of the given notebook USER root WORKDIR / - RUN conda install -y -c conda-forge nbdime - USER ${username} -WORKDIR ${HOME}/handson-ml +WORKDIR ${workdir} RUN git-nbdiffdriver config --enable --global -# INFO: Uncomment the RUN command below to ignore metadata in nbdiff within git diff +## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +## +RUN ls -l /tmp/ +COPY docker/nbdime-*.patch /tmp/ +RUN ls -l /tmp/ +USER root +WORKDIR / +RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-1-details.patch \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch +RUN rm /tmp/nbdime-*.patch +USER ${username} +WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc -RUN sudo rm -rf /tmp/bashrc +RUN sudo rm /tmp/bashrc diff --git a/docker/bashrc b/docker/bashrc index 3535389..b1bce45 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -1,4 +1,4 @@ -alias ll="ls -l" +alias ll="ls -alF" nbd() { DIRNAME=$(dirname "$1") @@ -8,5 +8,5 @@ nbd() { CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" + nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details } diff --git a/docker/nbdime-1-details.patch b/docker/nbdime-1-details.patch new file mode 100644 index 0000000..98f76d6 --- /dev/null +++ b/docker/nbdime-1-details.patch @@ -0,0 +1,17 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -548,8 +548,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True, meta + metadata_keys = ("/cells/*/metadata", "/metadata", "/cells/*/outputs/*/metadata") + if metadata: + for key in metadata_keys: +- if key in notebook_differs: +- del notebook_differs[key] ++ if details: ++ if key in notebook_differs: ++ del notebook_differs[key] ++ else: ++ notebook_differs[key] = diff_ignore_keys( ++ inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/docker/nbdime-2-toc.patch b/docker/nbdime-2-toc.patch new file mode 100644 index 0000000..4924e66 --- /dev/null +++ b/docker/nbdime-2-toc.patch @@ -0,0 +1,11 @@ 
+--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -553,7 +553,7 @@ + del notebook_differs[key] + else: + notebook_differs[key] = diff_ignore_keys( +- inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) ++ inner_differ=diff, ignore_keys=['toc', 'collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore From 8586120c3d21f4b0b6c11db18fe86b7b3f22f8c1 Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 12:59:26 +0100 Subject: [PATCH 03/15] Git filter testing demo --- docker/Dockerfile | 22 ++++++++++++++------- docker/ipynb_cleaner.py | 42 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) create mode 100755 docker/ipynb_cleaner.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 6b2852e..5daacee 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -40,9 +40,9 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -RUN mkdir -p ${HOME}/.jupyter && \ - echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ - >> ${HOME}/.jupyter/jupyter_notebook_config.py +#RUN mkdir -p ${HOME}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${HOME}/.jupyter/jupyter_notebook_config.py ## INFO: Uncomment the RUN command below to disable git diff paging RUN git config --global core.pager '' @@ -65,13 +65,11 @@ RUN git-nbdiffdriver config --enable --global ## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either # metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' -RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' -## -RUN ls -l /tmp/ +# INFO: Dirty nbdime patching COPY docker/nbdime-*.patch /tmp/ -RUN ls -l /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ @@ -86,3 +84,13 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc RUN sudo rm /tmp/bashrc + + +# INFO: Git filter testing +COPY docker/ipynb_cleaner.py /usr/bin/ipynb_cleaner +RUN mkdir -p ~/.config/git \ + && echo '*.ipynb filter=clean_ipynb' >> ~/.config/git/attributes \ + && git config --global filter.clean_ipynb.clean ipynb_cleaner \ + && git config --global filter.clean_ipynb.smudge cat + +# && git config --global filter.clean_ipynb.clean 'ipynb_cleaner %f' diff --git a/docker/ipynb_cleaner.py b/docker/ipynb_cleaner.py new file mode 100755 index 0000000..d34d7a6 --- /dev/null +++ b/docker/ipynb_cleaner.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +""" + +******************************** +DANGER - W.I.P. - TESTING ONLY!! +******************************** + +Clean jupyter notebook for git operations +Based on "Keeping IPython notebooks under Git version control" +(see: + https://gist.github.com/pbugnion/ea2797393033b54674af + http://pascalbugnion.net/blog/ipython-notebooks-and-git.html + http://stackoverflow.com/a/20844506/827862 +) +""" + +import sys +import json + +sys.stderr.write("\n\nCAUTION ! W.I.P ! 
Only dropping some test metadata, don't commit!\n\n") + +def log(x): + sys.stderr.write("\n\n[{}]\n\n\n".format(x)) +def logj(x): + sys.stderr.write("\n\n") + json.dump(x, sys.stderr, sort_keys=True, indent=1, separators=(",",": ")) + sys.stderr.write("\n\n") + +log(sys.argv) +#sys.exit(17) + +nb = sys.stdin.read() +json_in = json.loads(nb) + +logj(json_in["metadata"]) +del json_in["metadata"]["nav_menu"] +del json_in["metadata"]["toc"] +json_in["metadata"]["language_info"]["version"]="17.0" +logj(json_in["metadata"]) + +json.dump(json_in, sys.stdout, sort_keys=True, indent=1, separators=(",",": ")) From ef9df82689a0a530e50b0433033594f41cdb4af7 Mon Sep 17 00:00:00 2001 From: ziembla Date: Fri, 1 Dec 2017 10:56:36 +0100 Subject: [PATCH 04/15] Dockerfile publishable cleanup, git diff filter testing removed --- docker/Dockerfile | 54 +++++++++++++++++------------------------ docker/ipynb_cleaner.py | 42 -------------------------------- 2 files changed, 22 insertions(+), 74 deletions(-) delete mode 100755 docker/ipynb_cleaner.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 5daacee..e7efc36 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -32,28 +32,13 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main -## INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) -# That will switch jupyter to using empty password instead of a token. -# To avoid making a security hole you SHOULD in fact not only uncomment but -# regenerate the hash for your own non-empty password and replace the hash below. -# You can compute a password hash in any notebook, just run the code: -# from notebook.auth import passwd -# passwd() -# and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py - -## INFO: Uncomment the RUN command below to disable git diff paging -RUN git config --global core.pager '' - - -# INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing -# notebooks with nbdiff (and nbdiff support in git diff command) after connecting to -# the container by "make exec" (docker exec) -# Try: -# nbd NOTEBOOK_NAME.ipynb -# to get nbdiff between checkpointed version and current version of the given notebook +# INFO: Jupyter and nbdime extension are not totally integrated (anaconda image is py36, +# nbdime checks for py35 at the moment, still the config below enables diffing +# notebooks with nbdiff (and nbdiff support in git diff command) after connecting +# to the container by "make exec" (or "docker-compose exec handson-ml bash") +# You may also try running: +# nbd NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the given notebook USER root WORKDIR / RUN conda install -y -c conda-forge nbdime @@ -62,10 +47,10 @@ WORKDIR ${workdir} RUN git-nbdiffdriver config --enable --global -## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either # metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' -#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff 
--ignore-details' # INFO: Dirty nbdime patching @@ -85,12 +70,17 @@ COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc RUN sudo rm /tmp/bashrc +# INFO: Uncomment the RUN command below to disable git diff paging +#RUN git config --global core.pager '' -# INFO: Git filter testing -COPY docker/ipynb_cleaner.py /usr/bin/ipynb_cleaner -RUN mkdir -p ~/.config/git \ - && echo '*.ipynb filter=clean_ipynb' >> ~/.config/git/attributes \ - && git config --global filter.clean_ipynb.clean ipynb_cleaner \ - && git config --global filter.clean_ipynb.smudge cat - -# && git config --global filter.clean_ipynb.clean 'ipynb_cleaner %f' +# INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) +# That will switch jupyter to using empty password instead of a token. +# To avoid making a security hole you SHOULD in fact not only uncomment but +# regenerate the hash for your own non-empty password and replace the hash below. +# You can compute a password hash in any notebook, just run the code: +# from notebook.auth import passwd +# passwd() +# and take the hash from the output +#RUN mkdir -p ${HOME}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${HOME}/.jupyter/jupyter_notebook_config.py diff --git a/docker/ipynb_cleaner.py b/docker/ipynb_cleaner.py deleted file mode 100755 index d34d7a6..0000000 --- a/docker/ipynb_cleaner.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" - -******************************** -DANGER - W.I.P. - TESTING ONLY!! -******************************** - -Clean jupyter notebook for git operations -Based on "Keeping IPython notebooks under Git version control" -(see: - https://gist.github.com/pbugnion/ea2797393033b54674af - http://pascalbugnion.net/blog/ipython-notebooks-and-git.html - http://stackoverflow.com/a/20844506/827862 -) -""" - -import sys -import json - -sys.stderr.write("\n\nCAUTION ! W.I.P ! 
Only dropping some test metadata, don't commit!\n\n") - -def log(x): - sys.stderr.write("\n\n[{}]\n\n\n".format(x)) -def logj(x): - sys.stderr.write("\n\n") - json.dump(x, sys.stderr, sort_keys=True, indent=1, separators=(",",": ")) - sys.stderr.write("\n\n") - -log(sys.argv) -#sys.exit(17) - -nb = sys.stdin.read() -json_in = json.loads(nb) - -logj(json_in["metadata"]) -del json_in["metadata"]["nav_menu"] -del json_in["metadata"]["toc"] -json_in["metadata"]["language_info"]["version"]="17.0" -logj(json_in["metadata"]) - -json.dump(json_in, sys.stdout, sort_keys=True, indent=1, separators=(",",": ")) From 107de893049dea3afa26e432beb4158ceddf64ed Mon Sep 17 00:00:00 2001 From: ziembla Date: Fri, 1 Dec 2017 11:28:18 +0100 Subject: [PATCH 05/15] Nbdime patching ignored if the original file was changed --- docker/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e7efc36..a8fafa0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -53,14 +53,14 @@ RUN git-nbdiffdriver config --enable --global RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' -# INFO: Dirty nbdime patching +# INFO: Dirty nbdime patching (ignored if not matching) COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-1-details.patch \ + /tmp/nbdime-2-toc.patch || true \ && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch + /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch USER ${username} WORKDIR ${workdir} From ddb9784176586d618a9e6b4cc39f5f10ae6d19a1 Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 4 Dec 2017 11:33:16 +0100 Subject: [PATCH 06/15] tensorflow version unpinned, tensorboard support, home variable fix --- docker/Dockerfile | 17 ++++++++--------- docker/README.md | 4 +++- docker/bashrc | 6 ++++++ docker/docker-compose.yml | 1 + 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a8fafa0..bfccb99 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,20 +9,19 @@ RUN apt-get update && apt-get upgrade -y \ && rm -rf /var/lib/apt/lists/* RUN conda install -y -c conda-forge \ - tensorflow=1.0.0 \ + tensorflow \ jupyter_contrib_nbextensions ARG username ARG userid +ARG home=/home/${username} +ARG workdir=${home}/handson-ml + RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ && chmod 0440 /etc/sudoers.d/${username} -ENV HOME /home/${username} - -ARG workdir=${HOME}/handson-ml - WORKDIR ${workdir} RUN chown ${username}:${username} ${workdir} @@ -58,7 +57,7 @@ COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch || true \ + /tmp/nbdime-1-details.patch || true \ && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch @@ -67,7 +66,7 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${HOME}/.bashrc +RUN cat /tmp/bashrc >> ${home}/.bashrc RUN sudo rm /tmp/bashrc # INFO: Uncomment the RUN command below to disable git diff paging @@ -81,6 +80,6 @@ RUN sudo rm /tmp/bashrc # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir
-p ${HOME}/.jupyter && \ +#RUN mkdir -p ${home}/.jupyter && \ # echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/README.md b/docker/README.md index 2355c45..037ae22 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,7 +32,9 @@ You can close the server just by pressing `Ctrl-C` in terminal window. Run `make exec` (or `docker-compose exec handson-ml bash`) while the server is running to run an additional `bash` shell inside the `handson-ml` container. Now you're inside the environment prepared within the image. -One of the usefull things that can be done there may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. +One of the useful things that can be done there would be starting TensorBoard (for example with the simple `tb` command, see the bashrc file). + +Another one may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. You can see changes you made relative to the version in git using `git diff` which is integrated with `nbdiff`. diff --git a/docker/bashrc b/docker/bashrc index b1bce45..619677d 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -10,3 +10,9 @@ nbd() { # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details } + +tb() { + python -m tensorboard.main --logdir=tf_logs +} + +alias tensorboard="python -m tensorboard.main" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8a9718c..d4b46e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,7 @@ services: max-size: 50m ports: - "8888:8888" + - "6006:6006" volumes: - ../:/home/devel/handson-ml command: /opt/conda/bin/jupyter notebook --ip='*' --port=8888 --no-browser From 6e4004f16c8ffe31d61c7fc127feb0d8f947cc4b Mon Sep 17 00:00:00 2001 From: ziembla Date: Sat, 9 Dec 2017 20:17:56 +0100 Subject: [PATCH 07/15] scripts for jupyter notebooks cleanup, bin subdir on path --- docker/Dockerfile | 1 + docker/bashrc | 19 +----- docker/bin/nbclean_checkpoints | 116 +++++++++++++++++++++++++++++++++ docker/bin/nbdiff_checkpoint | 9 +++ docker/bin/rm_empty_subdirs | 54 +++++++++++++++ docker/bin/tensorboard | 2 + 6 files changed, 184 insertions(+), 17 deletions(-) create mode 100755 docker/bin/nbclean_checkpoints create mode 100755 docker/bin/nbdiff_checkpoint create mode 100755 docker/bin/rm_empty_subdirs create mode 100755 docker/bin/tensorboard diff --git a/docker/Dockerfile b/docker/Dockerfile index bfccb99..adf97f1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -67,6 +67,7 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${home}/.bashrc +RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc RUN sudo rm /tmp/bashrc # INFO: Uncomment the RUN command below to disable git diff paging diff --git a/docker/bashrc b/docker/bashrc index 619677d..ff19745 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -1,18 +1,3 @@ alias ll="ls -alF" - -nbd() { - DIRNAME=$(dirname
"$1") - BASENAME=$(basename "$1" .ipynb) - - WORKING_COPY=$DIRNAME/$BASENAME.ipynb - CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb - - # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details -} - -tb() { - python -m tensorboard.main --logdir=tf_logs -} - -alias tensorboard="python -m tensorboard.main" +alias nbd="nbdiff_checkpoint" +alias tb="tensorboard --logdir=tf_logs" diff --git a/docker/bin/nbclean_checkpoints b/docker/bin/nbclean_checkpoints new file mode 100755 index 0000000..ba4aaf9 --- /dev/null +++ b/docker/bin/nbclean_checkpoints @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +import collections +import glob +import hashlib +import os +import subprocess + + +class NotebookAnalyser: + + def __init__(self, dry_run=False, verbose=False, colorful=False): + self._dry_run = dry_run + self._verbose = verbose + self._colors = collections.defaultdict(lambda: "") + if colorful: + for color in [ + NotebookAnalyser.COLOR_WHITE, + NotebookAnalyser.COLOR_RED, + NotebookAnalyser.COLOR_GREEN, + NotebookAnalyser.COLOR_YELLOW, + ]: + self._colors[color] = "\033[{}m".format(color) + + NOTEBOOK_SUFFIX = ".ipynb" + CHECKPOINT_DIR = NOTEBOOK_SUFFIX + "_checkpoints" + CHECKPOINT_MASK = "*-checkpoint" + NOTEBOOK_SUFFIX + CHECKPOINT_MASK_LEN = len(CHECKPOINT_MASK) - 1 + + @staticmethod + def get_hash(file_path): + with open(file_path, "rb") as input: + hash = hashlib.md5() + for chunk in iter(lambda: input.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() + + MESSAGE_ORPHANED = "missing " + MESSAGE_MODIFIED = "modified" + MESSAGE_DELETED = "DELETING" + + COLOR_WHITE = "0" + COLOR_RED = "31" + COLOR_GREEN = "32" + COLOR_YELLOW = "33" + + def log(self, message, file, color=COLOR_WHITE): + color_on = self._colors[color] + color_off = self._colors[NotebookAnalyser.COLOR_WHITE] + print("{}{}{}: {}".format(color_on, message, color_off, file)) + + def clean_checkpoints(self, directory): + for checkpoint_path in sorted(glob.glob(os.path.join(directory, NotebookAnalyser.CHECKPOINT_MASK))): + + workfile_dir = os.path.dirname(os.path.dirname(checkpoint_path)) + workfile_name = os.path.basename(checkpoint_path)[:-NotebookAnalyser.CHECKPOINT_MASK_LEN] + NotebookAnalyser.NOTEBOOK_SUFFIX + workfile_path = os.path.join(workfile_dir, workfile_name) + + status = "" + if not os.path.isfile(workfile_path): + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_ORPHANED, workfile_path, NotebookAnalyser.COLOR_RED) + else: + checkpoint_stat = os.stat(checkpoint_path) + workfile_stat = os.stat(workfile_path) + + modified = workfile_stat.st_size != checkpoint_stat.st_size + + if not modified: + checkpoint_hash = NotebookAnalyser.get_hash(checkpoint_path) + workfile_hash = NotebookAnalyser.get_hash(workfile_path) + modified = checkpoint_hash != workfile_hash + + if modified: + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_MODIFIED, workfile_path, NotebookAnalyser.COLOR_YELLOW) + else: + self.log(NotebookAnalyser.MESSAGE_DELETED, checkpoint_path, NotebookAnalyser.COLOR_GREEN) + if not self._dry_run: + os.remove(checkpoint_path) + + if not self._dry_run and not os.listdir(directory): + self.log(NotebookAnalyser.MESSAGE_DELETED, directory, NotebookAnalyser.COLOR_GREEN) + os.rmdir(directory) + + def clean_checkpoints_recursively(self, directory): + for (root, subdirs, files) in os.walk(directory): + subdirs.sort() # INFO: traverse alphabetically + if NotebookAnalyser.CHECKPOINT_DIR in subdirs: + 
subdirs.remove(NotebookAnalyser.CHECKPOINT_DIR) # INFO: don't recurse there + self.clean_checkpoints(os.path.join(root, NotebookAnalyser.CHECKPOINT_DIR)) + + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove checkpointed versions of those jupyter notebooks that are identical to their working copies.", + epilog="""Notebooks will be reported as either + "DELETED" if the working copy and checkpointed version are identical + (checkpoint will be deleted), + "missing" if there is a checkpoint but no corresponding working file can be found + or "modified" if notebook and the checkpoint are not byte-to-byte identical. + If removal of checkpoints results in empty ".ipynb_checkpoints" directory + that directory is also deleted. + """) #, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("dirs", metavar="DIR", type=str, nargs="*", default=".", help="directories to search") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode") + parser.add_argument("-c", "--color", action="store_true", help="colorful mode") + args = parser.parse_args() + + analyser = NotebookAnalyser(args.dry_run, args.verbose, args.color) + for directory in args.dirs: + analyser.clean_checkpoints_recursively(directory) + +if __name__ == "__main__": + main() diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint new file mode 100755 index 0000000..ffbb21c --- /dev/null +++ b/docker/bin/nbdiff_checkpoint @@ -0,0 +1,9 @@ +#!/bin/bash +DIRNAME=$(dirname "$1") +BASENAME=$(basename "$1" .ipynb) + +WORKING_COPY=$DIRNAME/$BASENAME.ipynb +CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb + +echo "How change $CHECKPOINT_COPY into $WORKING_COPY" +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs new file mode 100755 index 0000000..8734b84 --- /dev/null +++ b/docker/bin/rm_empty_subdirs @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import os + +def remove_empty_directories(initial_dir, + allow_initial_delete=False, ignore_nonexistant_initial=False, + dry_run=False, quiet=False): + + FORBIDDEN_SUBDIRS = set([".git"]) + + if not os.path.isdir(initial_dir) and not ignore_nonexistant_initial: + raise RuntimeError("Initial directory '{}' not found!".format(initial_dir)) + + message = "removed" + if dry_run: + message = "to be " + message + + deleted = set() + + for (directory, subdirs, files) in os.walk(initial_dir, topdown=False): + forbidden = False + parent = directory + while parent: + parent, dirname = os.path.split(parent) + if dirname in FORBIDDEN_SUBDIRS: + forbidden = True + break + if forbidden: + continue + + is_empty = len(files) < 1 and len(set([os.path.join(directory, s) for s in subdirs]) - deleted) < 1 + + if is_empty and (initial_dir != directory or allow_initial_delete): + if not quiet: + print("{}: {}".format(message, directory)) + deleted.add(directory) + if not dry_run: + os.rmdir(directory) + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") + parser.add_argument("dir", metavar="DIR", type=str, nargs="*", default=".", help="directory to be searched") + parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") + parser.add_argument("-i", "--ignore-nonexistent-dir", 
action="store_true", help="don't throw an error if DIR doesn't exist") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-q", "--quiet", action="store_true", help="don't print names of directories being removed") + args = parser.parse_args() + for directory in args.dir: + remove_empty_directories(directory, args.allow_dir_removal, args.ignore_nonexistent_dir, + args.dry_run, args.quiet) + +if __name__ == "__main__": + main() diff --git a/docker/bin/tensorboard b/docker/bin/tensorboard new file mode 100755 index 0000000..dd7294d --- /dev/null +++ b/docker/bin/tensorboard @@ -0,0 +1,2 @@ +#!/bin/bash +python -m tensorboard.main "$@" From 5bb9d6d3dfba750b7e0cbcfe26733b17e8685219 Mon Sep 17 00:00:00 2001 From: ziembla Date: Sun, 10 Dec 2017 18:38:25 +0100 Subject: [PATCH 08/15] help message for nbdiff_checkpoint --- docker/bin/nbdiff_checkpoint | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint index ffbb21c..2969e1b 100755 --- a/docker/bin/nbdiff_checkpoint +++ b/docker/bin/nbdiff_checkpoint @@ -1,9 +1,16 @@ #!/bin/bash +if [ "$#" -ne 1 ]; then + echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" + echo + echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" + exit +fi + DIRNAME=$(dirname "$1") BASENAME=$(basename "$1" .ipynb) WORKING_COPY=$DIRNAME/$BASENAME.ipynb CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb -echo "How change $CHECKPOINT_COPY into $WORKING_COPY" +echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details From 30fef69ed026ee117db464766bd95af8c7df1d5e Mon Sep 17 00:00:00 2001 From: ziembla Date: Sun, 10 Dec 2017 18:18:33 +0000 Subject: [PATCH 09/15] rm_empty_subdirs changed to require explicit argument (defaulting to current dir withdrawn as potentially harmful) --- docker/bin/rm_empty_subdirs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs index 8734b84..34f3ea9 100755 --- a/docker/bin/rm_empty_subdirs +++ b/docker/bin/rm_empty_subdirs @@ -40,7 +40,7 @@ def remove_empty_directories(initial_dir, def main(): import argparse parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") - parser.add_argument("dir", metavar="DIR", type=str, nargs="*", default=".", help="directory to be searched") + parser.add_argument("dir", metavar="DIR", type=str, nargs="+", help="directory to be searched") parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") From 1d370f40016b2f9fa88b6486b5c3a726f7aac473 Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 06:52:17 +0100 Subject: [PATCH 10/15] nbdiff_checkpoint parameter parsing fixed --- docker/bin/nbdiff_checkpoint | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint index 2969e1b..9ce7cd0 100755 --- a/docker/bin/nbdiff_checkpoint +++ b/docker/bin/nbdiff_checkpoint @@ -1,5 +1,5 @@ #!/bin/bash -if [ "$#" -ne 1 ]; then +if [[ 
"$#" -lt 1 || "$1" =~ ^((-h)|(--help))$ ]] ; then echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" echo echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" @@ -8,9 +8,10 @@ fi DIRNAME=$(dirname "$1") BASENAME=$(basename "$1" .ipynb) +shift WORKING_COPY=$DIRNAME/$BASENAME.ipynb CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" -nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details "$@" From 60bb0e4e502bdc711ca5d339b2e2d2692195c14c Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 16:19:24 +0100 Subject: [PATCH 11/15] Uncommentable section in Dockerfile to autosave .py and .html alongside .ipynb --- docker/Dockerfile | 13 ++++++++++--- docker/{bashrc => bashrc.bash} | 0 docker/jupyter_notebook_config.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) rename docker/{bashrc => bashrc.bash} (100%) create mode 100644 docker/jupyter_notebook_config.py diff --git a/docker/Dockerfile b/docker/Dockerfile index adf97f1..2d24d04 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -65,10 +65,17 @@ USER ${username} WORKDIR ${workdir} -COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${home}/.bashrc +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc -RUN sudo rm /tmp/bashrc +RUN sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py # INFO: Uncomment the RUN command below to disable git diff paging #RUN git config --global core.pager '' diff --git a/docker/bashrc b/docker/bashrc.bash similarity index 100% rename from docker/bashrc rename to docker/bashrc.bash diff --git a/docker/jupyter_notebook_config.py b/docker/jupyter_notebook_config.py new file mode 100644 index 0000000..971a49a --- /dev/null +++ b/docker/jupyter_notebook_config.py @@ -0,0 +1,15 @@ +import os +import subprocess + +def export_script_and_view(model, os_path, contents_manager): + if model["type"] != "notebook": + return + dir_name, file_name = os.path.split(os_path) + file_base, file_ext = os.path.splitext(file_name) + if file_base.startswith("Untitled"): + return + export_name = file_base if file_ext == ".ipynb" else file_name + subprocess.check_call(["jupyter", "nbconvert", "--to", "script", file_name, "--output", export_name + "_script"], cwd=dir_name) + subprocess.check_call(["jupyter", "nbconvert", "--to", "html", file_name, "--output", export_name + "_view"], cwd=dir_name) + +c.FileContentsManager.post_save_hook = export_script_and_view From 9dfaa950d2091e7f37ddba996c68c60e79e05c3b Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 22:02:42 +0100 Subject: [PATCH 12/15] Dockerfile to spaces --- docker/Dockerfile | 30 +++++++++++++++--------------- docker/Makefile | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2d24d04..b4ec526 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,16 +1,16 @@ FROM continuumio/anaconda3 RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y \ - libpq-dev \ - 
build-essential \ - git \ - sudo \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y \ + libpq-dev \ + build-essential \ + git \ + sudo \ + && rm -rf /var/lib/apt/lists/* RUN conda install -y -c conda-forge \ - tensorflow \ - jupyter_contrib_nbextensions + tensorflow \ + jupyter_contrib_nbextensions ARG username ARG userid @@ -19,8 +19,8 @@ ARG home=/home/${username} ARG workdir=${home}/handson-ml RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ - && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ - && chmod 0440 /etc/sudoers.d/${username} + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} WORKDIR ${workdir} RUN chown ${username}:${username} ${workdir} @@ -57,9 +57,9 @@ COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-1-details.patch || true \ - && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch || true + /tmp/nbdime-1-details.patch || true \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch USER ${username} WORKDIR ${workdir} @@ -89,5 +89,5 @@ RUN sudo rm /tmp/bashrc.bash # passwd() # and take the hash from the output #RUN mkdir -p ${home}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${home}/.jupyter/jupyter_notebook_config.py +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/Makefile b/docker/Makefile index 6078fc9..f85c49a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -4,7 +4,7 @@ help: run: docker-compose up exec: - docker-compose exec handson-ml /bin/bash + docker-compose exec handson-ml bash build: stop .FORCE docker-compose build rebuild: stop .FORCE From ed40ca2be3bd4c569e3125df0863810177c833da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 21:35:51 +0100 Subject: [PATCH 13/15] Add comment about Python 3.6 SSL issue on MacOSX, fixes #145 --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bbec5aa..96c0fb3 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,11 @@ Of course, you obviously need Python. Python 2 is already preinstalled on most s $ python --version # for Python 2 $ python3 --version # for Python 3 -Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: +Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). 
On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). If you are using Python 3.6 on MacOSX, you need to run the following command to install the `certifi` package of certificates because Python 3.6 on MacOSX has no certificates to validate SSL connections (see this [StackOverflow question](https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error)): + + $ /Applications/Python\ 3.6/Install\ Certificates.command + +On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: $ sudo apt-get update $ sudo apt-get install python3 @@ -49,9 +53,9 @@ When using Anaconda, you can optionally create an isolated Python environment de This creates a fresh Python 3.5 environment called `mlbook` (you can change the name if you want to), and it activates it. This environment contains all the scientific libraries that come with Anaconda. This includes all the libraries we will need (NumPy, Matplotlib, Pandas, Jupyter and a few others), except for TensorFlow, so let's install it: - $ conda install -n mlbook -c conda-forge tensorflow=1.0.0 + $ conda install -n mlbook -c conda-forge tensorflow=1.4.0 -This installs TensorFlow 1.0.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. +This installs TensorFlow 1.4.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. Next, you can optionally install Jupyter extensions. These are useful to have nice tables of contents in the notebooks, but they are not required. 
From f2020952d00a7343e130f4b78b07d6b063ef4cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 22:40:17 +0100 Subject: [PATCH 14/15] Fix error in MyLinearSVC, fixes #140 --- 05_support_vector_machines.ipynb | 660 ++++++------------------------- 1 file changed, 121 insertions(+), 539 deletions(-) diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 687d74b..abbc1c1 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 5 – Support Vector Machines**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -35,11 +26,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -74,20 +61,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin classification" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The next few code cells generate the first figures in chapter 5. 
The first actual code sample comes after:" ] @@ -95,11 +76,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -121,11 +98,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Bad models\n", @@ -179,10 +152,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to feature scales" ] @@ -190,11 +160,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", @@ -230,10 +196,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to outliers" ] @@ -241,11 +204,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])\n", @@ -295,20 +254,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin *vs* margin violations" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This is the first code example in chapter 5:" ] @@ -316,11 +269,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -344,11 +293,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf.predict([[5.5, 1.7]])" @@ -356,10 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's generate the graph comparing different regularization settings:" ] @@ -367,11 +309,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -394,11 +332,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to unscaled parameters\n", @@ -422,11 +356,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,3.2))\n", @@ -454,9 +384,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "# Non-linear classification" @@ -465,11 +393,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X1D = np.linspace(-4, 4, 9).reshape(-1, 1)\n", @@ -508,11 +432,7 @@ { "cell_type": "code", "execution_count": 12, - 
"metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -533,11 +453,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -556,11 +472,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_predictions(clf, axes):\n", @@ -583,11 +495,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -602,11 +510,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "poly100_kernel_svm_clf = Pipeline([\n", @@ -619,11 +523,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(11, 4))\n", @@ -646,9 +546,6 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -716,11 +613,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "x1_example = X1D[3, 0]\n", @@ -732,11 +625,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rbf_kernel_svm_clf = Pipeline([\n", @@ -750,9 +639,6 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -787,10 +673,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regression\n" ] @@ -798,11 +681,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -814,11 +693,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -830,11 +705,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", @@ -857,11 +728,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_svm_regression(svm_reg, X, y, axes):\n", @@ -898,11 +765,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -914,11 +777,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { 
- "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -930,11 +789,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -948,11 +803,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 4))\n", @@ -969,10 +820,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Under the hood" ] @@ -980,11 +828,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "iris = datasets.load_iris()\n", @@ -995,11 +839,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", @@ -1042,10 +882,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Small weight vector results in a large margin" ] @@ -1053,11 +890,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n", @@ -1091,11 +924,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -1112,10 +941,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Hinge loss" ] @@ -1123,11 +949,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-2, 4, 200)\n", @@ -1148,20 +970,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Extra material" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Training time" ] @@ -1169,11 +985,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n", @@ -1184,11 +996,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -1210,10 +1018,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Linear SVM classifier implementation using Batch Gradient Descent" ] @@ -1221,11 +1026,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Training set\n", @@ -1236,11 +1037,7 @@ { 
"cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", @@ -1286,7 +1083,7 @@ "\n", " self.intercept_ = np.array([b])\n", " self.coef_ = np.array([w])\n", - " support_vectors_idx = (X_t.dot(w) + b < 1).ravel()\n", + " support_vectors_idx = (X_t.dot(w) + t * b < 1).ravel()\n", " self.support_vectors_ = X[support_vectors_idx]\n", " return self\n", "\n", @@ -1305,11 +1102,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(range(svm_clf.n_epochs), svm_clf.Js)\n", @@ -1319,11 +1112,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "print(svm_clf.intercept_, svm_clf.coef_)" @@ -1332,11 +1121,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf2 = SVC(kernel=\"linear\", C=C)\n", @@ -1347,11 +1132,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "yr = y.ravel()\n", @@ -1378,9 +1159,6 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -1412,20 +1190,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 7." ] @@ -1433,9 +1205,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "See appendix A." @@ -1443,30 +1213,21 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 8." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's use the Iris dataset: the Iris Setosa and Iris Versicolor classes are linearly separable." 
] @@ -1475,9 +1236,7 @@ "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1495,11 +1254,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC, LinearSVC\n", @@ -1528,10 +1283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's plot the decision boundaries of these three models:" ] @@ -1539,11 +1291,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Compute the slope and bias of each decision boundary\n", @@ -1576,40 +1324,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Close enough!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 9." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's load the dataset and split it into a training set and a test set. We could use `train_test_split()` but people usually just take the first 60,000 instances for the training set, and the last 10,000 instances for the test set (this makes it possible to compare your model's performance with others): " ] @@ -1617,11 +1353,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_mldata\n", @@ -1638,10 +1370,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first:" ] @@ -1650,9 +1379,7 @@ "cell_type": "code", "execution_count": 48, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1664,10 +1391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!" 
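For exercise 8, the cells above (minus their metadata) fit three linear classifiers on the two linearly separable Iris classes and compare them. A condensed, self-contained version of that comparison — `C = 5` and the `alpha` conversion for `SGDClassifier` are reasonable choices here, not necessarily the notebook's exact values:

```python
import numpy as np
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]                    # petal length, petal width
y = iris["target"]
mask = (y == 0) | (y == 1)                     # Setosa vs Versicolor: linearly separable
X, y = X[mask], y[mask]

X_scaled = StandardScaler().fit_transform(X)

C = 5
lin_clf = LinearSVC(loss="hinge", C=C, random_state=42).fit(X_scaled, y)
svc_clf = SVC(kernel="linear", C=C).fit(X_scaled, y)
sgd_clf = SGDClassifier(loss="hinge", alpha=1 / (len(X_scaled) * C),
                        random_state=42).fit(X_scaled, y)

for name, clf in (("LinearSVC", lin_clf), ("SVC", svc_clf), ("SGDClassifier", sgd_clf)):
    print(name, clf.intercept_, clf.coef_)     # the three boundaries should roughly agree
```

Printing the intercepts and coefficients is a quick way to confirm the three decision boundaries roughly coincide, which is what the boundary plot in the following cells illustrates.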
] @@ -1675,11 +1399,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1688,10 +1408,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):" ] @@ -1699,11 +1416,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", @@ -1714,10 +1427,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Wow, 82% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] @@ -1725,11 +1435,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -1740,11 +1446,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1754,11 +1456,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = lin_clf.predict(X_train_scaled)\n", @@ -1767,10 +1465,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default).\n", "\n", @@ -1780,11 +1475,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf = SVC(decision_function_shape=\"ovr\")\n", @@ -1794,11 +1485,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = svm_clf.predict(X_train_scaled)\n", @@ -1807,10 +1494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's promising, we get better performance even though we trained the model on 6 times less data. Let's tune the hyperparameters by doing a randomized search with cross validation. 
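(Before that search, it may help to see the scale-then-fit step from the preceding cells as one self-contained snippet. `fetch_mldata` has since been removed from scikit-learn, so this sketch assumes the newer `fetch_openml` loader and trains on a 10,000-instance slice to keep the run short — both substitutions, not the notebook's code.)

```python
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]
X_train, y_train = X[:60000], y[:60000]         # conventional MNIST training split

pipeline = Pipeline([
    ("scaler", StandardScaler()),               # scaling roughly halves the error rate
    ("svc", LinearSVC(random_state=42)),
])
pipeline.fit(X_train[:10000], y_train[:10000])  # small slice for speed; use all 60,000 for the real run
print(accuracy_score(y_train[:10000], pipeline.predict(X_train[:10000])))
```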
We will do this on a small dataset just to speed up the process:" ] @@ -1818,11 +1502,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", @@ -1836,11 +1516,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -1849,11 +1525,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_score_" @@ -1861,10 +1533,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This looks pretty low but remember we only trained the model on 1,000 instances. Let's retrain the best estimator on the whole training set (run this at night, it will take hours):" ] @@ -1872,11 +1541,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)" @@ -1885,11 +1550,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -1898,10 +1559,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Ah, this looks good! Let's select this model. Now we can test it on the test set:" ] @@ -1909,11 +1567,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -1922,40 +1576,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing `C` and/or `gamma`), but we would run the risk of overfitting the test set. Other people have found that the hyperparameters `C=5` and `gamma=0.005` yield even better performance (over 98% accuracy). By running the randomized search for longer and on a larger part of the training set, you may be able to find this as well." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 10." 
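The randomized search that closes exercise 9 above is worth having as a standalone snippet too. The sketch below swaps in scikit-learn's small built-in digits dataset so it finishes in seconds instead of needing the MNIST subset; the `reciprocal`/`uniform` bounds follow the same spirit as the notebook's search but are assumptions, not its exact values:

```python
from scipy.stats import reciprocal, uniform
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Sample C and gamma from continuous distributions instead of a fixed grid.
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(SVC(), param_distributions,
                                   n_iter=10, cv=3, random_state=42)
rnd_search_cv.fit(X_train_scaled, y_train)

print(rnd_search_cv.best_params_)
print(rnd_search_cv.best_score_)
```

As in the notebook, the best estimator found this way would then be refit on the full training set before the test set is touched.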
] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM regressor on the California housing dataset._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's load the dataset using Scikit-Learn's `fetch_california_housing()` function:" ] @@ -1964,9 +1606,7 @@ "cell_type": "code", "execution_count": 62, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1979,10 +1619,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Split it into a training set and a test set:" ] @@ -1991,9 +1628,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2004,10 +1639,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Don't forget to scale the data:" ] @@ -2016,9 +1648,7 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2031,10 +1661,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's train a simple `LinearSVR` first:" ] @@ -2042,11 +1669,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -2057,10 +1680,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's see how it performs on the training set:" ] @@ -2068,11 +1688,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -2084,10 +1700,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's look at the RMSE:" ] @@ -2095,11 +1708,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(mse)" @@ -2107,10 +1716,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In this training set, the targets are tens of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors somewhere around $10,000. Not great. Let's see if we can do better with an RBF Kernel. 
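For exercise 10's baseline, the cells above load the California housing data, scale it and fit a plain `LinearSVR`. Collapsed into one self-contained snippet — the split ratio and random seed here are illustrative choices, not necessarily the notebook's:

```python
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

housing = fetch_california_housing()
X, y = housing["data"], housing["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lin_svr = LinearSVR(random_state=42)
lin_svr.fit(X_train_scaled, y_train)

mse = mean_squared_error(y_train, lin_svr.predict(X_train_scaled))
print(np.sqrt(mse))   # training-set RMSE, in the dataset's target units
```

The RBF-kernel follow-up described next reuses the same randomized-search pattern shown for the classifier earlier, just with `SVR()` as the estimator and this RMSE as the yardstick.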
We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:" ] @@ -2118,11 +1724,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -2137,11 +1739,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -2149,10 +1747,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's measure the RMSE on the training set:" ] @@ -2160,11 +1755,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -2174,10 +1765,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks much better than the linear model. Let's select this model and evaluate it on the test set:" ] @@ -2185,11 +1773,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -2201,9 +1785,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -2225,7 +1807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -2239,5 +1821,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 1eaa53a6a21cd22a4a10c125e9dcb78c25b35375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 23:14:20 +0100 Subject: [PATCH 15/15] Add thanks to contributors in README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 96c0fb3..faa749c 100644 --- a/README.md +++ b/README.md @@ -109,3 +109,6 @@ This should open up your browser, and you should see Jupyter's tree view, with t Note: you can also visit [http://localhost:8888/nbextensions](http://localhost:8888/nbextensions) to activate and configure Jupyter extensions. Congrats! You are ready to learn Machine Learning, hands on! + +# Contributors +I would like to thank everyone who contributed to this project, either by providing useful feedback, filing issues or submitting Pull Requests. Special thanks go to Steven Bunkley and Ziembla who created the `docker` directory.