From 3b224bb7dad464cfc9165606c468ece07a8b7e8f Mon Sep 17 00:00:00 2001 From: Alexander Kolosov Date: Tue, 9 Jun 2020 06:13:04 +0000 Subject: [PATCH 1/8] Initial commit --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..4e051c1 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# dvc-lesson-4 + From d1352356bb87bc620815ad32c0ddd2b37ef75b14 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 23 Oct 2020 08:32:29 +0000 Subject: [PATCH 2/8] Release 1.0 --- .gitignore | 2 +- README.md | 34 +- dvc-3-automate-experiments.ipynb | 1329 +++++++++++------------------- requirements.txt | 19 +- src/evaluate.py | 6 - 5 files changed, 498 insertions(+), 892 deletions(-) diff --git a/.gitignore b/.gitignore index 69de92e..f6307b8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ .idea ## Python -venv* +dvc-venv __pycache__ .ipynb_checkpoints diff --git a/README.md b/README.md index cff8466..705a66a 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,42 @@ -# Tutorial: dvc-3-automate-experiments +# Tutorial: Automate DVC experiments (lesson 3) +## Machine Learning experiments reproducibility and engineering with DVC // ML REPA School -## 1. clone this repository +## 1. Clone this repository ```bash -git clone https://gitlab.com/7labs.ru/tutorials-dvc/dvc-3-automate-experiments.git +git clone https://github.com/mlrepa/dvc-3-automate-experiments.git cd dvc-3-automate-experiments ``` ## 2. Create and activate virtual environment -Install virtualenv in advance: - +Create virtual environment named `dvc-venv` (you may use another name) ```bash -pip install virtualenv +python3 -m venv dvc-venv +source dvc-venv/bin/activate ``` -Create virtual environment -```bash -virtualenv venv-dvc-3-automate-experiments -source venv-dvc-3-automate-experiments/bin/activate -``` - -## 3. Install python libraries (including dvc) +## 3. Install python libraries ```bash pip install -r requirements.txt ``` - ## 4. 
Add Virtual Environment to Jupyter Notebook ```bash -python -m ipykernel install --user --name=venv-dvc-3-automate-experiments +python -m ipykernel install --user --name=dvc-venv ``` -## 5. Run and follow Jupyter Notebook `dvc-3-automate-experiments.ipynb` for instructions: +## 5. Configure ToC for jupyter notebook (optional) ```bash -jupyter notebook +sudo jupyter contrib nbextension install +jupyter nbextension enable toc2/main ``` +## 6. Run and follow Jupyter Notebook `dvc-3-automate-experiments.ipynb` for instructions: + +```bash +jupyter notebook +``` diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb index 74371dd..32c7c91 100644 --- a/dvc-3-automate-experiments.ipynb +++ b/dvc-3-automate-experiments.ipynb @@ -12,120 +12,6 @@ "\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install with pip" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:18.843826Z", - "start_time": "2020-07-01T07:32:16.105734Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting dvc==1.5.0\n", - " Using cached dvc-1.5.0-py2.py3-none-any.whl (445 kB)\n", - "Collecting ruamel.yaml>=0.16.1\n", - " Using cached ruamel.yaml-0.16.10-py2.py3-none-any.whl (111 kB)\n", - "Collecting shortuuid>=0.5.0\n", - " Using cached shortuuid-1.0.1-py3-none-any.whl (7.5 kB)\n", - "Collecting shtab<2,>=1.3.0\n", - " Using cached shtab-1.3.1-py2.py3-none-any.whl (12 kB)\n", - "Collecting pydot>=1.2.4\n", - " Using cached pydot-1.4.1-py2.py3-none-any.whl (19 kB)\n", - "Collecting rich>=3.0.5\n", - " Using cached rich-5.2.0-py3-none-any.whl (145 kB)\n", - "Collecting tabulate>=0.8.7\n", - " Using cached tabulate-0.8.7-py3-none-any.whl (24 kB)\n", - "Processing /home/alex/.cache/pip/wheels/3c/33/97/805b282e129f60bb4e87cea622338f30b65f21eaf65219971f/funcy-1.14-py2.py3-none-any.whl\n", - "Processing 
/home/alex/.cache/pip/wheels/49/68/a0/8e7cb7bbf4990fc10b5a082aa0eb3ac66787ca11e8eca445b2/flufl.lock-3.2-py3-none-any.whl\n", - "Collecting pyasn1>=0.4.1\n", - " Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n", - "Collecting appdirs>=1.4.3\n", - " Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n", - "Requirement already satisfied: setuptools>=34.0.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (47.1.1)\n", - "Collecting tqdm<5,>=4.45.0\n", - " Using cached tqdm-4.48.2-py2.py3-none-any.whl (68 kB)\n", - "Processing /home/alex/.cache/pip/wheels/bc/f8/ae/bc69cb5f61393ebf9ade4cde41d1a813d35bfe78263a26f99e/dpath-2.0.1-py3-none-any.whl\n", - "Collecting grandalf==0.6\n", - " Using cached grandalf-0.6-py3-none-any.whl (31 kB)\n", - "Processing /home/alex/.cache/pip/wheels/b8/92/aa/456d462c908b4e210c3928f778d28f94049fc9e47af8b191c9/nanotime-0.5.2-py3-none-any.whl\n", - "Collecting flatten-json<0.1.8,>=0.1.6\n", - " Using cached flatten_json-0.1.7-py3-none-any.whl (6.4 kB)\n", - "Processing /home/alex/.cache/pip/wheels/ce/22/5c/bcd55db68399954d13c8d3b23192a517dd59ba3ee8648fa773/pygtrie-2.3.2-py3-none-any.whl\n", - "Requirement already satisfied: packaging>=19.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (20.4)\n", - "Requirement already satisfied: PyYAML<5.4,>=5.1.2 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from dvc==1.5.0) (5.3)\n", - "Processing /home/alex/.cache/pip/wheels/17/a2/0a/00fa5a0d6f271c82fc59be9ae47173bb6e6a462d4361224072/jsonpath_ng-1.5.1-py3-none-any.whl\n", - "Collecting colorama>=0.3.9\n", - " Using cached colorama-0.4.3-py2.py3-none-any.whl (15 kB)\n", - "Collecting toml>=0.10.1\n", - " Using cached toml-0.10.1-py2.py3-none-any.whl (19 kB)\n", - "Collecting pathspec>=0.6.0\n", - " Using cached pathspec-0.8.0-py2.py3-none-any.whl (28 kB)\n", - "Collecting gitpython>3\n", - " Using cached GitPython-3.1.7-py3-none-any.whl (158 kB)\n", - 
"Collecting networkx<2.5,>=2.1\n", - " Using cached networkx-2.4-py3-none-any.whl (1.6 MB)\n", - "Collecting ply>=3.9\n", - " Using cached ply-3.11-py2.py3-none-any.whl (49 kB)\n", - "Processing /home/alex/.cache/pip/wheels/0d/c4/19/13d74440f2a571841db6b6e0a273694327498884dafb9cf978/configobj-5.0.6-py3-none-any.whl\n", - "Collecting distro>=1.3.0\n", - " Using cached distro-1.5.0-py2.py3-none-any.whl (18 kB)\n", - "Collecting requests>=2.22.0\n", - " Using cached requests-2.24.0-py2.py3-none-any.whl (61 kB)\n", - "Processing /home/alex/.cache/pip/wheels/af/ee/20/047a79ba5ff692baa2f7e2e95c0cd57061a1673d59f5acf0d5/voluptuous-0.11.7-py3-none-any.whl\n", - "Collecting zc.lockfile>=1.2.1\n", - " Using cached zc.lockfile-2.0-py2.py3-none-any.whl (9.7 kB)\n", - "Collecting ruamel.yaml.clib>=0.1.2; platform_python_implementation == \"CPython\" and python_version < \"3.9\"\n", - " Using cached ruamel.yaml.clib-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (547 kB)\n", - "Requirement already satisfied: pyparsing>=2.1.4 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from pydot>=1.2.4->dvc==1.5.0) (2.4.7)\n", - "Collecting typing-extensions<4.0.0,>=3.7.4\n", - " Using cached typing_extensions-3.7.4.2-py3-none-any.whl (22 kB)\n", - "Collecting commonmark<0.10.0,>=0.9.0\n", - " Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from rich>=3.0.5->dvc==1.5.0) (2.6.1)\n", - "Processing /home/alex/.cache/pip/wheels/3e/5d/46/fa3cbde0ab8c53dbdd14658b3a4c97035b8851369ce8e79649/atpublic-2.0-py3-none-any.whl\n", - "Processing /home/alex/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e/future-0.18.2-cp37-none-any.whl\n", - "Requirement already satisfied: six in ./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from packaging>=19.0->dvc==1.5.0) (1.15.0)\n", - "Requirement already satisfied: decorator in 
./venv-dvc-3-automate-experiments/lib/python3.7/site-packages (from jsonpath-ng>=1.5.1->dvc==1.5.0) (4.4.2)\n", - "Collecting gitdb<5,>=4.0.1\n", - " Using cached gitdb-4.0.5-py3-none-any.whl (63 kB)\n", - "Collecting certifi>=2017.4.17\n", - " Using cached certifi-2020.6.20-py2.py3-none-any.whl (156 kB)\n", - "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", - " Using cached urllib3-1.25.10-py2.py3-none-any.whl (127 kB)\n", - "Collecting chardet<4,>=3.0.2\n", - " Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)\n", - "Collecting idna<3,>=2.5\n", - " Using cached idna-2.10-py2.py3-none-any.whl (58 kB)\n", - "Collecting smmap<4,>=3.0.1\n", - " Using cached smmap-3.0.4-py2.py3-none-any.whl (25 kB)\n", - "Installing collected packages: ruamel.yaml.clib, ruamel.yaml, shortuuid, shtab, pydot, colorama, typing-extensions, commonmark, rich, tabulate, funcy, atpublic, flufl.lock, pyasn1, appdirs, tqdm, dpath, future, grandalf, nanotime, flatten-json, pygtrie, ply, jsonpath-ng, toml, pathspec, smmap, gitdb, gitpython, networkx, configobj, distro, certifi, urllib3, chardet, idna, requests, voluptuous, zc.lockfile, dvc\n", - " Attempting uninstall: tqdm\n", - " Found existing installation: tqdm 4.42.0\n", - " Uninstalling tqdm-4.42.0:\n", - " Successfully uninstalled tqdm-4.42.0\n", - "Successfully installed appdirs-1.4.4 atpublic-2.0 certifi-2020.6.20 chardet-3.0.4 colorama-0.4.3 commonmark-0.9.1 configobj-5.0.6 distro-1.5.0 dpath-2.0.1 dvc-1.5.0 flatten-json-0.1.7 flufl.lock-3.2 funcy-1.14 future-0.18.2 gitdb-4.0.5 gitpython-3.1.7 grandalf-0.6 idna-2.10 jsonpath-ng-1.5.1 nanotime-0.5.2 networkx-2.4 pathspec-0.8.0 ply-3.11 pyasn1-0.4.8 pydot-1.4.1 pygtrie-2.3.2 requests-2.24.0 rich-5.2.0 ruamel.yaml-0.16.10 ruamel.yaml.clib-0.2.0 shortuuid-1.0.1 shtab-1.3.1 smmap-3.0.4 tabulate-0.8.7 toml-0.10.1 tqdm-4.48.2 typing-extensions-3.7.4.2 urllib3-1.25.10 voluptuous-0.11.7 zc.lockfile-2.0\n", - "\u001b[33mWARNING: You are using pip version 20.1.1; however, version 
20.2.2 is available.\n", - "You should consider upgrading via the '/home/alex/Dev/Projects/tutorials/tutorials-dvc/dvc-3-automate-experiments/venv-dvc-3-automate-experiments/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install dvc==1.5.0" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -134,25 +20,17 @@ ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-01T07:32:19.401395Z", "start_time": "2020-07-01T07:32:19.271265Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'dvc-tutorial'\r\n" - ] - } - ], "source": [ - "!git checkout -b dvc-tutorial" + "```bash\n", + "git checkout -b dvc-tutorial\n", + "```" ] }, { @@ -171,41 +49,17 @@ ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-01T07:32:22.463407Z", "start_time": "2020-07-01T07:32:21.450728Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "You can now commit the changes to git.\n", - "\n", - "\u001b[31m+---------------------------------------------------------------------+\n", - "\u001b[39m\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m DVC has enabled anonymous aggregate usage analytics. 
\u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m Read the analytics documentation (and how to opt-out) here: \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m \u001b[34mhttps://dvc.org/doc/user-guide/analytics\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m|\u001b[39m \u001b[31m|\u001b[39m\n", - "\u001b[31m+---------------------------------------------------------------------+\n", - "\u001b[39m\n", - "\u001b[33mWhat's next?\u001b[39m\n", - "\u001b[33m------------\u001b[39m\n", - "- Check out the documentation: \u001b[34mhttps://dvc.org/doc\u001b[39m\n", - "- Get help and share ideas: \u001b[34mhttps://dvc.org/chat\u001b[39m\n", - "- Star us on GitHub: \u001b[34mhttps://github.com/iterative/dvc\u001b[39m\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc init" + "```bash\n", + "dvc init\n", + "```" ] }, { @@ -216,35 +70,19 @@ ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-01T07:32:26.446894Z", "start_time": "2020-07-01T07:32:26.392814Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dvc-tutorial f285905] Initialize DVC\n", - " 6 files changed, 128 insertions(+)\n", - " create mode 100644 .dvc/.gitignore\n", - " create mode 100644 .dvc/config\n", - " create mode 100644 .dvc/plots/confusion.json\n", - " create mode 100644 .dvc/plots/default.json\n", - " create mode 100644 .dvc/plots/scatter.json\n", - " create mode 100644 .dvc/plots/smooth.json\n" - ] - } - ], "source": [ - "%%bash\n", + "```bash\n", "\n", "git add .\n", - "git commit -m \"Initialize DVC\"" + "git commit -m \"Initialize DVC\"\n", + "```" ] }, { @@ -262,64 +100,53 @@ ] }, { - "cell_type": "code", - "execution_count": 94, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:35.023136Z", "start_time": "2020-07-03T19:30:34.904974Z" } }, - "outputs": [], "source": [ - "!mkdir -p data" + "```bash\n", + "# Create `data` directory\n", + "\n", + 
"mkdir -p data\n", + "```" ] }, { - "cell_type": "code", - "execution_count": 95, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.406056Z", - "start_time": "2020-07-03T19:30:35.351794Z" + "end_time": "2020-10-21T09:58:42.844179Z", + "start_time": "2020-10-21T09:58:42.840016Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'data_load' from run-cache \n", - "Skipping run, checking out outputs\n", - "Creating 'dvc.yaml'\n", - "Adding stage 'data_load' in 'dvc.yaml'\n", - "Generating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml .dvc/.gitignore\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc run -n data_load \\\n", + "```bash\n", + "# Create data_load pipeline stage\n", + "\n", + "dvc run -n data_load \\\n", " -d src/data_load.py \\\n", " -o data/iris.csv \\\n", " -o data/classes.json \\\n", " -p data_load \\\n", " python src/data_load.py \\\n", - " --config=params.yaml" + " --config=params.yaml\n", + "\n", + "```" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.455211Z", - "start_time": "2020-07-03T19:30:37.433214Z" - } + "end_time": "2020-10-21T13:42:01.176530Z", + "start_time": "2020-10-21T13:42:01.147399Z" + }, + "scrolled": true }, "outputs": [ { @@ -327,13 +154,7 @@ "output_type": "stream", "text": [ "4.0K\tdata/classes.json\n", - "4.0K\tdata/cm.csv\n", - "4.0K\tdata/iris.csv\n", - "8.0K\tdata/iris_featurized.csv\n", - "4.0K\tdata/metrics.json\n", - "8.0K\tdata/model.joblib\n", - "4.0K\tdata/test.csv\n", - "8.0K\tdata/train.csv\n" + "4.0K\tdata/iris.csv\n" ] } ], @@ -345,11 +166,11 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.604922Z", - "start_time": "2020-07-03T19:30:37.479654Z" + "end_time": 
"2020-10-21T10:03:42.595865Z", + "start_time": "2020-10-21T10:03:42.471874Z" } }, "outputs": [ @@ -359,18 +180,7 @@ "text": [ "\u001b[01;34m.\u001b[00m\r\n", "├── README.md\r\n", - "├── \u001b[01;34mdata\u001b[00m\r\n", - "│   ├── classes.json\r\n", - "│   ├── cm.csv\r\n", - "│   ├── iris.csv\r\n", - "│   ├── iris_featurized.csv\r\n", - "│   ├── metrics.json\r\n", - "│   ├── model.joblib\r\n", - "│   ├── test.csv\r\n", - "│   └── train.csv\r\n", "├── dvc-3-automate-experiments.ipynb\r\n", - "├── dvc.lock\r\n", - "├── dvc.yaml\r\n", "├── params.yaml\r\n", "├── requirements.txt\r\n", "└── \u001b[01;34msrc\u001b[00m\r\n", @@ -381,12 +191,14 @@ " ├── split_dataset.py\r\n", " └── train.py\r\n", "\r\n", - "2 directories, 20 files\r\n" + "1 directory, 10 files\r\n" ] } ], "source": [ - "!tree -I venv-dvc-3-automate-experiments" + "# Note: we use `tree -I ...` pattern to not list those files that match the wild-card pattern.\n", + "\n", + "!tree -I dvc-venv" ] }, { @@ -398,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:37.727096Z", @@ -436,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:37.877998Z", @@ -488,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:39.781553Z", @@ -500,8 +312,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Data and pipelines are up to date.\n", + "Stage 'data_load' is cached - skipping run, checking out outputs core\u001b[39m>\n", "\u001b[0m" ] } @@ -522,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:41.698409Z", @@ -534,8 +345,13 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"Stage 'data_load' didn't change, skipping \n", - "Data and pipelines are up to date.\n", + "Running stage 'data_load' with command: core\u001b[39m>\n", + "\tpython src/data_load.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' core\u001b[39m>\n", + "\n", + "To track the changes with git, run:\n", + "\n", + "\tgit add dvc.lock\n", "\u001b[0m" ] } @@ -564,48 +380,34 @@ ] }, { - "cell_type": "code", - "execution_count": 103, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:45.387596Z", "start_time": "2020-07-03T19:30:43.388868Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'feature_extraction' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'feature_extraction' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc run -n feature_extraction \\\n", + "```bash\n", + "\n", + "dvc run -n feature_extraction \\\n", " -d src/featurization.py \\\n", " -d data/iris.csv \\\n", " -o data/iris_featurized.csv \\\n", " -p data_load,featurize \\\n", " python src/featurization.py \\\n", - " --config=params.yaml" + " --config=params.yaml\n", + "\n", + "```" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.561869Z", - "start_time": "2020-07-03T19:30:45.439521Z" + "end_time": "2020-10-21T14:22:48.664322Z", + "start_time": "2020-10-21T14:22:48.539481Z" } }, "outputs": [ @@ -613,11 +415,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "README.md params.yaml\r\n", - "\u001b[1m\u001b[36mdata\u001b[m\u001b[m requirements.txt\r\n", - "dvc-3-automate-experiments.ipynb \u001b[1m\u001b[36msrc\u001b[m\u001b[m\r\n", - "dvc.lock \u001b[1m\u001b[36mvenv-dvc-3-automate-experiments\u001b[m\u001b[m\r\n", - "dvc.yaml\r\n" + 
"README.md dvc.yaml\r\n", + "\u001b[1m\u001b[36mdata\u001b[m\u001b[m params.yaml\r\n", + "dvc-3-automate-experiments.ipynb requirements.txt\r\n", + "\u001b[1m\u001b[36mdvc-venv\u001b[m\u001b[m \u001b[1m\u001b[36msrc\u001b[m\u001b[m\r\n", + "dvc.lock\r\n" ] } ], @@ -627,11 +429,11 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.706627Z", - "start_time": "2020-07-03T19:30:45.585641Z" + "end_time": "2020-10-21T14:22:55.648732Z", + "start_time": "2020-10-21T14:22:55.526727Z" } }, "outputs": [ @@ -668,11 +470,11 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.745702Z", - "start_time": "2020-07-03T19:30:45.734321Z" + "end_time": "2020-10-21T14:22:58.124155Z", + "start_time": "2020-10-21T14:22:57.646371Z" } }, "outputs": [ @@ -758,7 +560,7 @@ "4 5.0 3.6 1.4 0.2 0" ] }, - "execution_count": 106, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -771,55 +573,43 @@ ] }, { - "cell_type": "code", - "execution_count": 107, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Commit changes " + ] + }, + { + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.893549Z", - "start_time": "2020-07-03T19:30:45.763986Z" + "end_time": "2020-10-21T12:58:37.280454Z", + "start_time": "2020-10-21T12:58:37.272728Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31m??\u001b[m .dvc/\r\n", - "\u001b[31m??\u001b[m dvc.lock\r\n", - "\u001b[31m??\u001b[m dvc.yaml\r\n" - ] - } - ], "source": [ - "!git status -s" + "```bash\n", + "# Check Git status\n", + "\n", + "git status -s\n", + "```" ] }, { - "cell_type": "code", - "execution_count": 108, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:45.961182Z", "start_time": 
"2020-07-03T19:30:45.916816Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev 0ae7569] Add stage features_extraction\n", - " 3 files changed, 56 insertions(+)\n", - " create mode 100644 .dvc/.gitignore\n", - " create mode 100644 dvc.lock\n", - " create mode 100644 dvc.yaml\n" - ] - } - ], "source": [ - "%%bash\n", + "```bash\n", + "# Commit changes \n", + "\n", "git add .\n", - "git commit -m \"Add stage features_extraction\"" + "git commit -m \"Add stage features_extraction\"\n", + "```" ] }, { @@ -830,49 +620,52 @@ ] }, { - "cell_type": "code", - "execution_count": 109, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:48.044867Z", "start_time": "2020-07-03T19:30:45.984594Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'split_dataset' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'split_dataset' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc run -n split_dataset \\\n", + "```bash\n", + "\n", + "dvc run -n split_dataset \\\n", " -d src/split_dataset.py \\\n", " -d data/iris_featurized.csv \\\n", " -o data/train.csv \\\n", " -o data/test.csv \\\n", " -p featurize,data_split \\\n", " python src/split_dataset.py \\\n", - " --config=params.yaml" + " --config=params.yaml\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-03T19:30:48.250249Z", + "start_time": "2020-07-03T19:30:48.209429Z" + } + }, + "source": [ + "```bash\n", + "# Commit changes\n", + "\n", + "git add .\n", + "git commit -m \"Add stage split_dataset\"\n", + "\n", + "```" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.186864Z", - 
"start_time": "2020-07-03T19:30:48.068177Z" + "end_time": "2020-10-21T14:23:25.865051Z", + "start_time": "2020-10-21T14:23:25.749060Z" }, "scrolled": true }, @@ -919,31 +712,6 @@ "!cat dvc.yaml" ] }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.250249Z", - "start_time": "2020-07-03T19:30:48.209429Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev e39a9d3] Add stage split_dataset\n", - " 2 files changed, 32 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage split_dataset\"" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -952,48 +720,51 @@ ] }, { - "cell_type": "code", - "execution_count": 112, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:50.298161Z", "start_time": "2020-07-03T19:30:48.275068Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'train' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'train' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock dvc.yaml\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc run -n train \\\n", + "```bash\n", + "\n", + "dvc run -n train \\\n", " -d src/train.py \\\n", " -d data/train.csv \\\n", " -o data/model.joblib \\\n", " -p data_split,train \\\n", " python src/train.py \\\n", - " --config=params.yaml" + " --config=params.yaml\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-03T19:30:50.512656Z", + "start_time": "2020-07-03T19:30:50.468759Z" + } + }, + "source": [ + "```bash\n", + "# Commit changes\n", + "\n", + "git add .\n", + "git commit -m \"Add stage train\"\n", + "\n", + "```" ] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 16, "metadata": { 
"ExecuteTime": { - "end_time": "2020-07-03T19:30:50.444828Z", - "start_time": "2020-07-03T19:30:50.324345Z" + "end_time": "2020-10-21T14:23:55.190584Z", + "start_time": "2020-10-21T14:23:55.074531Z" }, "scrolled": true }, @@ -1050,31 +821,6 @@ "!cat dvc.yaml" ] }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.512656Z", - "start_time": "2020-07-03T19:30:50.468759Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev d084d1b] Add stage train\n", - " 2 files changed, 28 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage train\"" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1083,33 +829,17 @@ ] }, { - "cell_type": "code", - "execution_count": 115, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.746281Z", - "start_time": "2020-07-03T19:30:50.546074Z" + "end_time": "2020-10-21T13:03:19.945663Z", + "start_time": "2020-10-21T13:03:19.941005Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Restored stage 'evaluate' from run-cache \n", - "Skipping run, checking out outputs\n", - "Adding stage 'evaluate' in 'dvc.yaml'\n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.yaml dvc.lock\n", - "\u001b[0m" - ] - } - ], "source": [ - "!dvc run -n evaluate \\\n", + "```bash\n", + "\n", + "dvc run -n evaluate \\\n", " -d src/evaluate.py \\\n", " -d data/test.csv \\\n", " -d data/model.joblib \\\n", @@ -1118,16 +848,34 @@ " --plots data/cm.csv \\\n", " -p data_load,data_split,train,evaluate \\\n", " python src/evaluate.py \\\n", - " --config=params.yaml" + " --config=params.yaml\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-03T19:30:52.971253Z", + "start_time": "2020-07-03T19:30:52.919420Z" + } + 
}, + "source": [ + "```bash\n", + "# Commit changes\n", + "\n", + "git add .\n", + "git commit -m \"Add stage evaluate\"\n", + "```" ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 17, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.886914Z", - "start_time": "2020-07-03T19:30:52.769527Z" + "end_time": "2020-10-21T14:24:17.094698Z", + "start_time": "2020-10-21T14:24:16.977100Z" }, "scrolled": true }, @@ -1201,43 +949,6 @@ "!cat dvc.yaml" ] }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.971253Z", - "start_time": "2020-07-03T19:30:52.919420Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[dev ecf5bc5] Add stage evaluate\n", - " 2 files changed, 46 insertions(+)\n" - ] - } - ], - "source": [ - "%%bash\n", - "git add .\n", - "git commit -m \"Add stage evaluate\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-06-28T17:23:10.812463Z", - "start_time": "2020-06-28T17:23:09.886129Z" - } - }, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -1270,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:31:02.889684Z", @@ -1282,7 +993,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", "Stage 'feature_extraction' didn't change, skipping\n", "Stage 'split_dataset' didn't change, skipping\n", "Stage 'train' didn't change, skipping\n", @@ -1294,6 +1005,7 @@ ], "source": [ "# Nothing to reproduce\n", + "\n", "!dvc repro" ] }, @@ -1315,32 +1027,20 @@ ] }, { - "cell_type": "code", - "execution_count": 119, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:31:05.089755Z", 
"start_time": "2020-07-03T19:31:04.832150Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp1-ratio-features'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - "* \u001b[32mexp1-ratio-features\u001b[m\n", - " master\u001b[m\n" - ] - } - ], "source": [ - "# create new branch\n", + "```bash\n", + "# Create new branch\n", "\n", - "!git checkout -b exp1-ratio-features\n", - "!git branch" + "git checkout -b exp1-ratio-features\n", + "git branch\n", + "```" ] }, { @@ -1377,11 +1077,11 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 21, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.674990Z", - "start_time": "2020-07-03T19:31:25.527004Z" + "end_time": "2020-10-21T14:26:16.419175Z", + "start_time": "2020-10-21T14:26:13.569917Z" }, "scrolled": false }, @@ -1390,21 +1090,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Running stage 'feature_extraction' with command:\n", - "\tpython src/featurization.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", + "Stage 'feature_extraction' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock'\n", "\n", - "Restored stage 'split_dataset' from run-cache\n", - "Skipping run, checking out outputs\n", + "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", "Updating lock file 'dvc.lock'\n", "\n", - "Restored stage 'train' from run-cache\n", - "Skipping run, checking out outputs\n", + "Stage 'train' is cached - skipping run, checking out outputs\n", "Updating lock file 'dvc.lock'\n", "\n", - "Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", + "Stage 'evaluate' is cached - skipping run, checking out outputs\n", "Updating lock file 'dvc.lock'\n", "\n", "To track the changes with git, run:\n", @@ 
-1420,11 +1116,11 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 22, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.713726Z", - "start_time": "2020-07-03T19:31:28.699701Z" + "end_time": "2020-10-21T14:26:17.593890Z", + "start_time": "2020-10-21T14:26:17.580883Z" } }, "outputs": [ @@ -1529,7 +1225,7 @@ "4 1.388889 7.0 " ] }, - "execution_count": 121, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1545,11 +1241,11 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 23, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:31:28.867945Z", - "start_time": "2020-07-03T19:31:28.737094Z" + "end_time": "2020-10-21T14:26:20.852572Z", + "start_time": "2020-10-21T14:26:20.724853Z" } }, "outputs": [ @@ -1561,6 +1257,7 @@ "Changes not staged for commit:\r\n", " (use \"git add ...\" to update what will be committed)\r\n", " (use \"git restore ...\" to discard changes in working directory)\r\n", + "\t\u001b[31mmodified: dvc-3-automate-experiments.ipynb\u001b[m\r\n", "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n", "\t\u001b[31mmodified: src/featurization.py\u001b[m\r\n", "\r\n", @@ -1574,11 +1271,11 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 24, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:31:36.736663Z", - "start_time": "2020-07-03T19:31:35.023151Z" + "end_time": "2020-10-21T14:26:28.905617Z", + "start_time": "2020-10-21T14:26:26.333849Z" } }, "outputs": [ @@ -1586,8 +1283,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 0.15385 0.0\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 0.15385 0.0\n", "\u001b[0m" ] } @@ -1598,29 +1295,29 @@ ] }, { - "cell_type": "code", - "execution_count": 125, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commit the experiment changes" + ] + }, + { + "cell_type": 
"markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:31:39.838836Z", "start_time": "2020-07-03T19:31:39.445353Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[exp1-ratio-features 1fc8ec3] Experiment with new features\n", - " 3 files changed, 872 insertions(+), 510 deletions(-)\n", - "fatal: tag 'exp1_ratio_features' already exists\n" - ] - } - ], "source": [ - "!git add .\n", - "!git commit -m \"Experiment with new features\"\n", - "!git tag -a \"exp1_ratio_features\" -m \"Experiment with new features\"" + "```bash\n", + "# Commit changes\n", + "\n", + "git add .\n", + "git commit -m \"Experiment with new features\"\n", + "git tag -a \"exp1_ratio_features\" -m \"Experiment with new features\"\n", + "\n", + "```" ] }, { @@ -1638,38 +1335,25 @@ ] }, { - "cell_type": "code", - "execution_count": 127, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:32:43.387938Z", "start_time": "2020-07-03T19:32:43.131917Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp2-tuning-logreg'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - " exp1-ratio-features\u001b[m\n", - "* \u001b[32mexp2-tuning-logreg\u001b[m\n", - " master\u001b[m\n" - ] - } - ], "source": [ - "# create new branch for experiment\n", + "```bash\n", + "# Create new branch for experiment\n", "\n", - "!git checkout -b exp2-tuning-logreg\n", - "!git branch" + "git checkout -b exp2-tuning-logreg\n", + "git branch\n", + "```" ] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:32:52.254763Z", @@ -1681,7 +1365,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", "Stage 'feature_extraction' didn't change, skipping\n", "Stage 'split_dataset' didn't change, 
skipping\n", "Stage 'train' didn't change, skipping\n", @@ -1708,7 +1392,7 @@ "replace LogisticRegression params with:\n", "\n", "```python\n", - " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n", + " clf = LogisticRegression(C=0.01, solver='lbfgs', multi_class='multinomial', max_iter=100)\n", "```\n", "__Note__: here we changed logistic regresssion hyperparameters: C to 0.1\n", "\n", @@ -1725,11 +1409,11 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 25, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:33:22.746410Z", - "start_time": "2020-07-03T19:33:19.314933Z" + "end_time": "2020-10-21T14:27:55.449989Z", + "start_time": "2020-10-21T14:27:50.294369Z" }, "scrolled": false }, @@ -1738,16 +1422,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", + "Stage 'feature_extraction' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock'\n", + "\n", + "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock'\n", + "\n", "Running stage 'train' with command:\n", "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", + "Updating lock file 'dvc.lock' core\u001b[39m>\n", "\n", - "Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Running stage 'evaluate' with command:\n", + "\tpython src/evaluate.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' core\u001b[39m>\n", "\n", "To track the changes with git, run:\n", "\n", @@ -1757,18 +1445,18 @@ } ], "source": [ - "# re-run pipeline \n", + "# Re-run pipeline \n", "\n", "!dvc repro" ] }, { "cell_type": "code", - 
"execution_count": 131, + "execution_count": 30, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:33:24.945534Z", - "start_time": "2020-07-03T19:33:24.825464Z" + "end_time": "2020-10-22T14:03:21.593735Z", + "start_time": "2020-10-22T14:03:21.473130Z" } }, "outputs": [ @@ -1776,22 +1464,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"f1_score\": 1.0}" + "{\"f1_score\": 0.9305555555555555}" ] } ], "source": [ "# Get difference with metric from previous pipeline\n", + "\n", "!cat data/metrics.json" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 31, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:34:06.466000Z", - "start_time": "2020-07-03T19:34:05.328958Z" + "end_time": "2020-10-22T14:03:48.640425Z", + "start_time": "2020-10-22T14:03:47.009948Z" } }, "outputs": [ @@ -1799,8 +1488,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", + "\tdata/metrics.json: core\u001b[39m>\n", + "\t\tf1_score: 0.9305555555555555\n", "\u001b[0m" ] } @@ -1811,11 +1500,11 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 32, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:34:08.160934Z", - "start_time": "2020-07-03T19:34:06.494683Z" + "end_time": "2020-10-22T14:03:52.823772Z", + "start_time": "2020-10-22T14:03:50.239522Z" } }, "outputs": [ @@ -1823,8 +1512,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.84615\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 0.93056 0.77671\n", "\u001b[0m" ] } @@ -1837,12 +1526,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Commit" + "### Commit changes" ] }, { - "cell_type": "code", - "execution_count": 137, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:36:15.808072Z", @@ -1850,29 +1538,17 @@ }, "scrolled": true }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "On branch exp2-tuning-logreg\n", - "nothing to commit, working tree clean\n" - ] - } - ], "source": [ - "%%bash\n", + "```bash\n", + "# Commit changes\n", "\n", "git add .\n", "git commit -m \"Tune model. LogisticRegression. C=0.1\"\n", - "git tag -a \"exp2_tuning_logreg\" -m \"Tune model. LogisticRegression. C=0.1\"" + "git tag -a \"exp2_tuning_logreg\" -m \"Tune model. LogisticRegression. C=0.01\"\n", + "\n", + "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -1893,32 +1569,19 @@ ] }, { - "cell_type": "code", - "execution_count": 138, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:36:20.443851Z", "start_time": "2020-07-03T19:36:20.187021Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to a new branch 'exp3-svm'\n", - " dev\u001b[m\n", - " dvc-tutorial\u001b[m\n", - " exp1-ratio-features\u001b[m\n", - " exp2-tuning-logreg\u001b[m\n", - "* \u001b[32mexp3-svm\u001b[m\n", - " master\u001b[m\n" - ] - } - ], "source": [ - "!git checkout -b exp3-svm\n", - "!git branch" + "```bash\n", + "# Create a new experiment branch \n", + "\n", + "git checkout -b exp3-svm\n", + "```" ] }, { @@ -1949,7 +1612,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:36:35.537208Z", @@ -1961,16 +1624,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' didn't change, skipping \n", + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", "Stage 'feature_extraction' didn't change, skipping\n", "Stage 'split_dataset' didn't change, skipping\n", "Running stage 'train' with command:\n", "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", + "Updating lock file 'dvc.lock' core\u001b[39m>\n", "\n", - 
"Restored stage 'evaluate' from run-cache\n", - "Skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Running stage 'evaluate' with command:\n", + "\tpython src/evaluate.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' core\u001b[39m>\n", "\n", "To track the changes with git, run:\n", "\n", @@ -1985,11 +1648,11 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 33, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:36:38.995561Z", - "start_time": "2020-07-03T19:36:37.831841Z" + "end_time": "2020-10-22T14:05:25.052314Z", + "start_time": "2020-10-22T14:05:23.390114Z" } }, "outputs": [ @@ -1997,8 +1660,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", + "\tdata/metrics.json: core\u001b[39m>\n", + "\t\tf1_score: 0.9665831244778613\n", "\u001b[0m" ] } @@ -2009,11 +1672,11 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 34, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:36:40.521084Z", - "start_time": "2020-07-03T19:36:40.392754Z" + "end_time": "2020-10-22T14:05:35.904182Z", + "start_time": "2020-10-22T14:05:35.779856Z" } }, "outputs": [ @@ -2037,28 +1700,28 @@ ] }, { - "cell_type": "code", - "execution_count": 142, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commit changes" + ] + }, + { + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:36:41.766798Z", "start_time": "2020-07-03T19:36:41.377185Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[exp3-svm 1474ec0] Experiment 3 with SVM estimator\r\n", - " 2 files changed, 5 insertions(+), 4 deletions(-)\r\n" - ] - } - ], "source": [ - "!git add .\n", - "!git commit -m \"Experiment 3 with SVM estimator\"\n", - "!git tag -a \"exp3_svm\" -m \"Experiment 3 with SVM estimator\"" + "```bash\n", + "# Commit changes\n", + "\n", + "git add .\n", + "git commit -m 
\"Experiment 3 with SVM estimator\"\n", + "git tag -a \"exp3_svm\" -m \"Experiment 3 with SVM estimator\"\n", + "```" ] }, { @@ -2069,8 +1732,7 @@ ] }, { - "cell_type": "code", - "execution_count": 153, + "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:45:17.537969Z", @@ -2078,51 +1740,13 @@ }, "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Auto-merging src/train.py\n", - "CONFLICT (content): Merge conflict in src/train.py\n", - "Auto-merging src/featurization.py\n", - "CONFLICT (add/add): Merge conflict in dvc.lock\n", - "Auto-merging dvc.lock\n", - "Auto-merging dvc-3-automate-experiments.ipynb\n", - "CONFLICT (content): Merge conflict in dvc-3-automate-experiments.ipynb\n", - "Automatic merge failed; fix conflicts and then commit the result.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Previous HEAD position was 1474ec0 Experiment 3 with SVM estimator\n", - "Switched to branch 'dvc-tutorial'\n" - ] - }, - { - "ename": "CalledProcessError", - "evalue": "Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - 
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2352\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic 
docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\ngit checkout dvc-tutorial \\ngit merge exp3_svm\\n'' returned non-zero exit status 1." 
- ] - } - ], "source": [ - "%%bash\n", + "```bash\n", + "# Merge the best experiment\n", "\n", "git checkout dvc-tutorial \n", - "git merge exp3_svm" + "git merge exp3_svm\n", + "```" ] }, { @@ -2141,11 +1765,11 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 36, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:39:20.728429Z", - "start_time": "2020-07-03T19:39:19.065249Z" + "end_time": "2020-10-22T14:07:30.189016Z", + "start_time": "2020-10-22T14:07:27.537384Z" } }, "outputs": [ @@ -2153,7 +1777,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[0m " + "\u001b[0m core\u001b[39m>" ] } ], @@ -2165,11 +1789,11 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 37, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:39:29.288964Z", - "start_time": "2020-07-03T19:39:27.598159Z" + "end_time": "2020-10-22T14:07:47.322051Z", + "start_time": "2020-10-22T14:07:44.759864Z" } }, "outputs": [ @@ -2177,7 +1801,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Param Old New \n", + "Path Param Old Newre\u001b[39m>\n", "params.yaml data_load.classes_names_path data/classes.json data/classes.json\n", "params.yaml data_load.raw_data_path data/iris.csv data/iris.csv\n", "params.yaml data_split.test_path data/test.csv data/test.csv\n", @@ -2200,7 +1824,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:09:20.304575Z", @@ -2212,7 +1836,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"params.yaml\": {\"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, 
\"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}}}\n", + "{\"params.yaml\": {\"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"data_load.dummy_param\": {\"old\": \"dummy_value\", \"new\": \"dummy_value\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}, \"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}}}\n", "\u001b[0m" ] } @@ -2223,7 +1847,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:09:27.495017Z", @@ -2238,6 +1862,7 @@ "| Path | Param | Old | New |\n", "|-------------|------------------------------|--------------------------|--------------------------|\n", "| params.yaml | data_load.classes_names_path | data/classes.json | data/classes.json |\n", + "| params.yaml | data_load.dummy_param | dummy_value | dummy_value |\n", 
"| params.yaml | data_load.raw_data_path | data/iris.csv | data/iris.csv |\n", "| params.yaml | data_split.test_path | data/test.csv | data/test.csv |\n", "| params.yaml | data_split.test_size | 0.2 | 0.2 |\n", @@ -2247,6 +1872,7 @@ "| params.yaml | featurize.features_path | data/iris_featurized.csv | data/iris_featurized.csv |\n", "| params.yaml | featurize.target_column | target | target |\n", "| params.yaml | train.model_path | data/model.joblib | data/model.joblib |\n", + "\n", "\u001b[0m" ] } @@ -2257,11 +1883,11 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 38, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:00:44.847802Z", - "start_time": "2020-07-03T19:00:44.717758Z" + "end_time": "2020-10-22T14:08:29.319419Z", + "start_time": "2020-10-22T14:08:29.189441Z" } }, "outputs": [ @@ -2269,49 +1895,98 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mcommit 736c92a6eeda6261f528d7a2e2d4db4cb306fa03\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mexp2-svm\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp2_svm\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 336832e6c8c51861d58e258b6cf7bc5ddc750459\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp3_svm\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: Mikhail \r\n", + "Date: Thu Oct 22 17:05:44 2020 +0300\r\n", + "\r\n", + " Experiment 3 with SVM estimator\r\n", + "\r\n", + "\u001b[33mcommit aff5b7f5d143895108b4dac9939a9c0cd06a349d\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp2_tuning_logreg\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp2-tuning-logreg\u001b[m\u001b[33m)\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Fri Jul 3 21:49:25 2020 +0300\r\n", + "Date: Thu Oct 22 17:04:33 2020 +0300\r\n", "\r\n", - " Experiment 2 with SVM estimator\r\n", + " Tune model. LogisticRegression. 
C=0.1\r\n", "\r\n", - "\u001b[33mcommit 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 7ab2b518063b63742a396ca83ce6a092a260589a\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Fri Jul 3 21:48:42 2020 +0300\r\n", + "Date: Wed Oct 21 17:27:03 2020 +0300\r\n", "\r\n", " Experiment with new features\r\n", "\r\n", - "\u001b[33mcommit 34a0bc667f86c3b5e388bef672eb598b8a6a7788\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 7619688214cc3b9fe3d3b59674c07c12fc134b47\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:35:03 2020 +0300\r\n", + "Date: Wed Oct 21 17:24:13 2020 +0300\r\n", "\r\n", " Add stage evaluate\r\n", "\r\n", - "\u001b[33mcommit 4c45a4ff702106d78bbaf8d356e0e95ca268e05b\u001b[m\r\n", + "\u001b[33mcommit 2a59d083d38b1a15dab6fe4c2b53c00a9d7f0447\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:34:09 2020 +0300\r\n", + "Date: Wed Oct 21 17:23:50 2020 +0300\r\n", "\r\n", " Add stage train\r\n", "\r\n", - "\u001b[33mcommit f41781d2c4855762c4405636491bc014cc00bd20\u001b[m\r\n", + "\u001b[33mcommit 1a908460f95ed339ccdf735a193054b8af0632dc\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:34:00 2020 +0300\r\n", + "Date: Wed Oct 21 17:23:22 2020 +0300\r\n", "\r\n", " Add stage split_dataset\r\n", "\r\n", - "\u001b[33mcommit dbfc854a931baf57ad116f811c2cea39d4fb69a9\u001b[m\r\n", + "\u001b[33mcommit 915ba326763d905b4c3559cb29ec825f48fb11f0\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:33:51 2020 +0300\r\n", + "Date: Wed Oct 21 17:23:04 2020 +0300\r\n", "\r\n", " Add stage features_extraction\r\n", "\r\n", - "\u001b[33mcommit 
f2859056db4c53e11ba0593388fddd19018d577b\u001b[m\r\n", + "\u001b[33mcommit 0e12b6aab49d3d96570f95950df411ccf00c6326\u001b[m\r\n", "Author: Mikhail \r\n", - "Date: Wed Jul 1 10:32:26 2020 +0300\r\n", + "Date: Wed Oct 21 12:56:45 2020 +0300\r\n", "\r\n", " Initialize DVC\r\n", "\r\n", - "\u001b[33mcommit 1102dc2e3f636b2d37558f95a960c788f3de32ed\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m, \u001b[m\u001b[1;32mdev\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 25cb0004d22fdc923536ac5232e146361e74e90c\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/update-dvc-tutorials\u001b[m\u001b[33m, \u001b[m\u001b[1;32mupdate-dvc-tutorials\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: Alex \r\n", + "Date: Mon Oct 19 18:03:15 2020 +0900\r\n", + "\r\n", + " Update tutorial notebook\r\n", + "\r\n", + "\u001b[33mcommit bf430d9af143db178d6fa39b405437ee9f8ab1f3\u001b[m\r\n", + "Author: Alex \r\n", + "Date: Mon Oct 19 17:36:07 2020 +0900\r\n", + "\r\n", + " Update .gitignore\r\n", + "\r\n", + "\u001b[33mcommit 0ecca2cdab5972ba68f88cc395b2c4ba4dc47dcf\u001b[m\r\n", + "Author: Alex \r\n", + "Date: Mon Oct 19 17:35:42 2020 +0900\r\n", + "\r\n", + " Update documentation\r\n", + "\r\n", + "\u001b[33mcommit c6293c84e8aa3b0e65afaaecc1a3c9b03d6b2dec\u001b[m\r\n", + "Author: Alex \r\n", + "Date: Mon Oct 19 17:35:23 2020 +0900\r\n", + "\r\n", + " Update versions of python libraries\r\n", + "\r\n", + "\u001b[33mcommit 15bd59fe85e1e002d1ea45230dc61f3b9c4dcfe3\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: Alex \r\n", + "Date: Tue Aug 18 12:45:16 2020 +0900\r\n", + "\r\n", + " Update DVC -> 1.5.0\r\n", + "\r\n", + "\u001b[33mcommit 3573af273082ae1cad1b141131eb824e314eee43\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdev\u001b[m\u001b[33m)\u001b[m\r\n", + "Merge: aeecba0 1102dc2\r\n", + "Author: Mikhail \r\n", + "Date: Sat Jul 4 07:23:25 2020 +0300\r\n", + "\r\n", + " Updated evaluate and params\r\n", + "\r\n", + "\u001b[33mcommit 
aeecba0880f016303eaa20181d27b60bd3ceb388\u001b[m\r\n", + "Author: Mikhail \r\n", + "Date: Sat Jul 4 07:16:31 2020 +0300\r\n", + "\r\n", + " Update experimeting and metrics section\r\n", + "\r\n", + "\u001b[33mcommit 1102dc2e3f636b2d37558f95a960c788f3de32ed\u001b[m\r\n", "Merge: 855c61a 92ac211\r\n", "Author: Mikhail \r\n", "Date: Wed Jul 1 07:22:32 2020 +0000\r\n", @@ -2426,11 +2101,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 41, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:01:12.974894Z", - "start_time": "2020-07-03T19:01:11.320625Z" + "end_time": "2020-10-22T14:08:58.251510Z", + "start_time": "2020-10-22T14:08:55.660152Z" } }, "outputs": [ @@ -2438,13 +2113,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[0m " + "\u001b[0m core\u001b[39m>" ] } ], "source": [ - "\n", - "!dvc params diff 24f75fdcc9bede20cbecf88697b5d3f8ed56f58c HEAD^" + "!dvc params diff 7619688214cc3b9fe3d3b59674c07c12fc134b47 HEAD^" ] }, { @@ -2456,11 +2130,11 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 42, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:42:07.828077Z", - "start_time": "2020-07-03T19:42:06.658092Z" + "end_time": "2020-10-22T14:09:14.039904Z", + "start_time": "2020-10-22T14:09:12.356063Z" }, "scrolled": true }, @@ -2469,8 +2143,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: \n", - "\t\tf1_score: 1.0\n", + "\tdata/metrics.json: core\u001b[39m>\n", + "\t\tf1_score: 0.9665831244778613\n", "\u001b[0m" ] } @@ -2483,11 +2157,11 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 43, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:42:10.492627Z", - "start_time": "2020-07-03T19:42:09.201160Z" + "end_time": "2020-10-22T14:09:18.038960Z", + "start_time": "2020-10-22T14:09:16.234043Z" } }, "outputs": [ @@ -2495,30 +2169,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "workspace: \n", - 
"\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "dev:\n", + "dvc-tutorial: core\u001b[39m>\n", "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", - "dvc-tutorial:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9305555555555555\n", + "\t\tf1_score: 0.9665831244778613\n", "exp1-ratio-features:\n", "\tdata/metrics.json:\n", "\t\tf1_score: 0.15384615384615383\n", "exp2-tuning-logreg:\n", "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", + "\t\tf1_score: 0.9305555555555555\n", "exp3-svm:\n", "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", + "\t\tf1_score: 0.9665831244778613\n", + "exp1_ratio_features:\n", + "\tdata/metrics.json:\n", + "\t\tf1_score: 0.15384615384615383\n", "exp2_tuning_logreg:\n", "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", + "\t\tf1_score: 0.9305555555555555\n", "exp3_svm:\n", "\tdata/metrics.json:\n", - "\t\tf1_score: 1.0\n", + "\t\tf1_score: 0.9665831244778613\n", "\u001b[0m" ] } @@ -2538,11 +2209,11 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 44, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:43:27.774038Z", - "start_time": "2020-07-03T19:43:26.104962Z" + "end_time": "2020-10-22T14:09:38.852532Z", + "start_time": "2020-10-22T14:09:36.162510Z" } }, "outputs": [ @@ -2550,7 +2221,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[0m " + "\u001b[0m core\u001b[39m>" ] } ], @@ -2560,11 +2231,11 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 45, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:44:46.444858Z", - "start_time": "2020-07-03T19:44:44.738955Z" + "end_time": "2020-10-22T14:09:41.557718Z", + "start_time": "2020-10-22T14:09:38.912543Z" } }, "outputs": [ @@ -2572,8 +2243,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.0\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.96658 0.96658 0.0\n", "\u001b[0m" ] } @@ 
-2593,11 +2264,11 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 46, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:11:04.120125Z", - "start_time": "2020-07-03T19:11:02.460457Z" + "end_time": "2020-10-22T14:09:57.211470Z", + "start_time": "2020-10-22T14:09:54.710817Z" } }, "outputs": [ @@ -2605,8 +2276,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 1.0 0.84615\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 0.96658 0.81274\n", "\u001b[0m" ] } @@ -2614,17 +2285,16 @@ "source": [ "# Compare old and new branches\n", "\n", - "\n", - "!dvc metrics diff exp1-ratio-features exp2-svm" + "!dvc metrics diff exp1-ratio-features exp3-svm" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 47, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:10:59.357203Z", - "start_time": "2020-07-03T19:10:57.708759Z" + "end_time": "2020-10-22T14:10:03.187146Z", + "start_time": "2020-10-22T14:10:00.568957Z" } }, "outputs": [ @@ -2632,8 +2302,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Value Change \n", - "data/metrics.json f1_score 0.93056 0.77671\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 0.96658 0.81274\n", "\u001b[0m" ] } @@ -2646,72 +2316,11 @@ }, { "cell_type": "code", - "execution_count": 157, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:50:29.269796Z", - "start_time": "2020-07-03T19:50:29.132897Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Switched to branch 'dev'\r\n", - "Your branch is ahead of 'origin/dev' by 7 commits.\r\n", - " (use \"git push\" to publish your local commits)\r\n" - ] - } - ], - "source": [ - "!git checkout dev -f" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* чтобы выводить не только новую, но и старую 
метрики, нужно добавить опцию --old" - ] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:48:02.485718Z", - "start_time": "2020-07-03T19:48:01.562562Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR\u001b[39m: failed to show metrics diff - unable to read: 'dvc.lock', YAML file structure is corrupted: while scanning a simple key\n", - " in \"\", line 22, column 1\n", - "could not find expected ':'\n", - " in \"\", line 23, column 8\n", - "\n", - "\u001b[33mHaving any troubles?\u001b[39m Hit us up at \u001b[34mhttps://dvc.org/support\u001b[39m, we are always happy to help!\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Use --old to show both old and new metrics vlues \n", - "\n", - "!dvc metrics diff --old exp1-ratio-features exp2-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 158, + "execution_count": 49, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:50:33.253819Z", - "start_time": "2020-07-03T19:50:31.570404Z" + "end_time": "2020-10-22T14:10:38.911976Z", + "start_time": "2020-10-22T14:10:36.304792Z" }, "scrolled": true }, @@ -2720,14 +2329,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "| Path | Metric | Value | Change | \n", - "|--------|----------|---------|----------|\n", + "| Path | Metric | Old | New | Change | core\u001b[39m>\n", + "|-------------------|----------|---------|---------|----------|\n", + "| data/metrics.json | f1_score | 0.15385 | 0.96658 | 0.81274 |\n", + "\n", "\u001b[0m" ] } ], "source": [ - "!dvc metrics diff --show-md" + "!dvc metrics diff exp1-ratio-features --show-md" ] }, { @@ -2746,11 +2357,11 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 50, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T20:08:18.872602Z", - "start_time": "2020-07-03T20:08:18.869605Z" + "end_time": "2020-10-22T14:11:04.697127Z", + "start_time": 
"2020-10-22T14:11:04.694448Z" } }, "outputs": [], @@ -2767,11 +2378,11 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 60, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T20:10:21.387140Z", - "start_time": "2020-07-03T20:10:20.271263Z" + "end_time": "2020-10-22T14:15:07.599224Z", + "start_time": "2020-10-22T14:15:05.455604Z" } }, "outputs": [ @@ -2779,7 +2390,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-show.html\n", + "file:///Users/mnrozhkov/dev/dvc/dvc-3-automate-experiments/data/plots-show.html\n", "\u001b[0m" ] } @@ -2790,11 +2401,11 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 61, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T20:10:21.421474Z", - "start_time": "2020-07-03T20:10:21.416923Z" + "end_time": "2020-10-22T14:15:07.664691Z", + "start_time": "2020-10-22T14:15:07.660138Z" }, "scrolled": false }, @@ -2804,7 +2415,7 @@ "text/html": [ "\n", " " + "" ] }, - "execution_count": 177, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "IFrame(src='data/plots-show.html', width=500, height=500)" + "IFrame(src='data/plots-show.html', width=800, height=500)" ] }, { @@ -2834,11 +2445,11 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 62, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T20:27:04.674839Z", - "start_time": "2020-07-03T20:27:03.879598Z" + "end_time": "2020-10-22T14:15:16.499865Z", + "start_time": "2020-10-22T14:15:15.626027Z" } }, "outputs": [ @@ -2846,7 +2457,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "file:///Users/mnrozhkov/dev/dvc/course/dvc-3-automate-experiments/data/plots-diff.html\n", + "file:///Users/mnrozhkov/dev/dvc/dvc-3-automate-experiments/data/plots-diff.html\n", "\u001b[0m" ] } @@ -2858,11 +2469,11 @@ }, { "cell_type": "code", - "execution_count": 194, + "execution_count": 
64, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T20:27:34.434387Z", - "start_time": "2020-07-03T20:27:34.430369Z" + "end_time": "2020-10-22T14:15:23.321541Z", + "start_time": "2020-10-22T14:15:23.317724Z" }, "scrolled": false }, @@ -2872,8 +2483,8 @@ "text/html": [ "\n", " " + "" ] }, - "execution_count": 194, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "IFrame(src='data/plots-diff.html', width=1000, height=400)" + "IFrame(src='data/plots-diff.html', width=800, height=500)" ] }, { @@ -2917,7 +2528,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.3" }, "toc": { "base_numbering": 1, @@ -2932,7 +2543,7 @@ "height": "calc(100% - 180px)", "left": "10px", "top": "150px", - "width": "230.953px" + "width": "230.947px" }, "toc_section_display": true, "toc_window_display": true diff --git a/requirements.txt b/requirements.txt index d470460..dd287b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -joblib==0.15.1 +dvc==1.8.4 +joblib==0.17.0 jupyter==1.0.0 jupyter_contrib_nbextensions==0.5.1 -ipykernel==5.3.0 -matplotlib==3.1.2 -numpy==1.18.1 -pandas==1.0.0 -pyyaml==5.3 -scikit-learn==0.23.1 -scipy==1.4.1 -tqdm==4.42.0 \ No newline at end of file +ipykernel==5.3.4 +matplotlib==3.3.2 +numpy==1.19.2 +pandas==1.1.3 +pyyaml==5.3.1 +scikit-learn==0.23.2 +scipy==1.5.3 +tqdm==4.50.2 \ No newline at end of file diff --git a/src/evaluate.py b/src/evaluate.py index bc0d609..ba65d5d 100644 --- a/src/evaluate.py +++ b/src/evaluate.py @@ -36,12 +36,6 @@ def evaluate(config_path: Text) -> None: fp=open(metrics_path, 'w') ) - # pd.DataFrame({'actual': y, 'predicted': prediction}).apply( - # lambda series: series.map( - # {i: cls_name for i, cls_name in enumerate(classes)} - # ) - # ).to_csv(confusion_matrix_path, index=False) - mapping = {i: cls_name for i, cls_name in enumerate(classes)} cmdf = pd.DataFrame( {'actual': y, 'predicted': 
prediction} From 3a35f44c72841ab7e9a02ae0fed85141104e08f9 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Mon, 15 Mar 2021 22:44:36 +0900 Subject: [PATCH 3/8] Update dependencies --- requirements.txt | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index dd287b3..613946b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ -dvc==1.8.4 -joblib==0.17.0 +dvc==2.0.5 +joblib==1.0.1 jupyter==1.0.0 jupyter_contrib_nbextensions==0.5.1 -ipykernel==5.3.4 -matplotlib==3.3.2 -numpy==1.19.2 -pandas==1.1.3 -pyyaml==5.3.1 -scikit-learn==0.23.2 -scipy==1.5.3 -tqdm==4.50.2 \ No newline at end of file +ipykernel==5.5.0 +matplotlib==3.3.4 +numpy==1.20.1 +pandas==1.2.3 +python-box==5.3.0 +pyyaml==5.4.1 +scikit-learn==0.24.1 +scipy==1.6.1 +tqdm==4.59.0 \ No newline at end of file From 875d1963ff4a87bb9d3ae691ad3e80b8b985eeb0 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Mon, 15 Mar 2021 22:45:17 +0900 Subject: [PATCH 4/8] Read config using python-box --- README.md | 1 + src/data_load.py | 11 +++++------ src/evaluate.py | 21 ++++++++------------- src/featurization.py | 11 +++++------ src/split_dataset.py | 18 +++++++----------- src/train.py | 13 ++++++------- src/utils.py | 19 +++++++++++++++++++ 7 files changed, 51 insertions(+), 43 deletions(-) create mode 100644 src/utils.py diff --git a/README.md b/README.md index 705a66a..b134091 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ cd dvc-3-automate-experiments Create virtual environment named `dvc` (you may use other name) ```bash python3 -m venv dvc-venv +echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate source dvc-venv/bin/activate ``` diff --git a/src/data_load.py b/src/data_load.py index b07a825..0400519 100644 --- a/src/data_load.py +++ b/src/data_load.py @@ -2,7 +2,8 @@ import json from sklearn.datasets import load_iris from typing import Text -import yaml + +from src.utils import load_config def data_load(config_path: 
Text) -> None: @@ -12,18 +13,16 @@ def data_load(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - raw_data_path = config['data_load']['raw_data_path'] - classes_names_path = config['data_load']['classes_names_path'] + config = load_config(config_path) data = load_iris(as_frame=True) classes_names = data.target_names.tolist() dataset = data.frame dataset.columns = [colname.strip(' (cm)').replace(' ', '_') for colname in dataset.columns.tolist()] - dataset.to_csv(raw_data_path, index=False) + dataset.to_csv(config.data_load.raw_data_path, index=False) - with open(classes_names_path, 'w') as classes_names_file: + with open(config.data_load.classes_names_path, 'w') as classes_names_file: json.dump(obj={'classes_names': classes_names}, fp=classes_names_file) diff --git a/src/evaluate.py b/src/evaluate.py index ba65d5d..d9c4e42 100644 --- a/src/evaluate.py +++ b/src/evaluate.py @@ -4,7 +4,8 @@ import pandas as pd from sklearn.metrics import f1_score from typing import Text -import yaml + +from src.utils import load_config def evaluate(config_path: Text) -> None: @@ -13,34 +14,29 @@ def evaluate(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - classes_names_path = config['data_load']['classes_names_path'] - test_dataset_path = config['data_split']['test_path'] - model_path = config['train']['model_path'] - metrics_path = config['evaluate']['metrics_file'] - confusion_matrix_path = config['evaluate']['confusion_matrix'] + config = load_config(config_path) - classes = json.load(open(classes_names_path))['classes_names'] + classes = json.load(open(config.data_load.classes_names_path))['classes_names'] - test_dataset = pd.read_csv(test_dataset_path) + test_dataset = pd.read_csv(config.data_split.test_path) y = test_dataset.loc[:, 'target'].values.astype('float32') X = test_dataset.drop('target', axis=1).values - clf = joblib.load(model_path) + clf 
= joblib.load(config.train.model_path) prediction = clf.predict(X) f1 = f1_score(y_true=y, y_pred=prediction, average='macro') json.dump( obj={'f1_score': f1}, - fp=open(metrics_path, 'w') + fp=open(config.evaluate.metrics_file, 'w') ) mapping = {i: cls_name for i, cls_name in enumerate(classes)} cmdf = pd.DataFrame( {'actual': y, 'predicted': prediction} ).apply(lambda series: series.map(mapping)) - cmdf.to_csv(confusion_matrix_path, index=False) + cmdf.to_csv(config.evaluate.confusion_matrix, index=False) if __name__ == '__main__': @@ -50,4 +46,3 @@ def evaluate(config_path: Text) -> None: args = args_parser.parse_args() evaluate(config_path=args.config) - diff --git a/src/featurization.py b/src/featurization.py index 2306f34..9aea31a 100644 --- a/src/featurization.py +++ b/src/featurization.py @@ -1,7 +1,8 @@ import argparse import pandas as pd from typing import Text -import yaml + +from src.utils import load_config def get_features(dataset): @@ -17,13 +18,11 @@ def featurize(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - raw_data_path = config['data_load']['raw_data_path'] - featurized_dataset_path = config['featurize']['features_path'] + config = load_config(config_path) - dataset = pd.read_csv(raw_data_path) + dataset = pd.read_csv(config.data_load.raw_data_path) features = get_features(dataset) - features.to_csv(featurized_dataset_path, index=False) + features.to_csv(config.featurize.features_path, index=False) if __name__ == '__main__': diff --git a/src/split_dataset.py b/src/split_dataset.py index ffd6f11..8b8bd38 100644 --- a/src/split_dataset.py +++ b/src/split_dataset.py @@ -2,7 +2,8 @@ from sklearn.model_selection import train_test_split import pandas as pd from typing import Text -import yaml + +from src.utils import load_config def split_train_test(config_path: Text) -> None: @@ -11,20 +12,15 @@ def split_train_test(config_path: Text) -> None: config_path {Text}: path to config """ - 
config = yaml.safe_load(open(config_path)) - featurized_dataset_path = config['featurize']['features_path'] - train_dataset_path = config['data_split']['train_path'] - test_dataset_path = config['data_split']['test_path'] - test_size = config['data_split']['test_size'] - - dataset = pd.read_csv(featurized_dataset_path) + config = load_config(config_path) + dataset = pd.read_csv(config.featurize.features_path) # Split in train/test - + test_size = config.data_split.test_size df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=42) - df_train.to_csv(train_dataset_path, index=False) - df_test.to_csv(test_dataset_path, index=False) + df_train.to_csv(config.data_split.train_path, index=False) + df_test.to_csv(config.data_split.test_path, index=False) if __name__ == '__main__': diff --git a/src/train.py b/src/train.py index fefd056..56f39c7 100644 --- a/src/train.py +++ b/src/train.py @@ -4,7 +4,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from typing import Text -import yaml + +from src.utils import load_config def train(config_path: Text) -> None: @@ -13,11 +14,9 @@ def train(config_path: Text) -> None: config_path {Text}: path to config """ - config = yaml.safe_load(open(config_path)) - train_dataset_path = config['data_split']['train_path'] - model_path = config['train']['model_path'] + config = load_config(config_path) # Load train set - train_dataset = pd.read_csv(train_dataset_path) + train_dataset = pd.read_csv(config.data_split.train_path) # Get X and Y y = train_dataset.loc[:, 'target'].values.astype('float32') @@ -27,7 +26,7 @@ def train(config_path: Text) -> None: clf = LogisticRegression(C=0.00001, solver='lbfgs', multi_class='multinomial', max_iter=100) clf.fit(X, y) - joblib.dump(clf, model_path) + joblib.dump(clf, config.train.model_path) if __name__ == '__main__': @@ -36,4 +35,4 @@ def train(config_path: Text) -> None: args_parser.add_argument('--config', dest='config', required=True) 
args = args_parser.parse_args() - train(config_path=args.config) \ No newline at end of file + train(config_path=args.config) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..ce4b6a5 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,19 @@ +import box +from typing import Text +import yaml + + +def load_config(config_path: Text) -> box.ConfigBox: + """Loads yaml config in instance of box.ConfigBox. + Args: + config_path {Text}: path to config + Returns: + box.ConfigBox + """ + + with open(config_path) as config_file: + + config = yaml.safe_load(config_file) + config = box.ConfigBox(config) + + return config From 7a5671c45a72874c35ea5e3695123f1945d96182 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Mon, 15 Mar 2021 23:36:39 +0900 Subject: [PATCH 5/8] Update tutorial notebook --- dvc-3-automate-experiments.ipynb | 1325 +++++++++++++++--------------- 1 file changed, 646 insertions(+), 679 deletions(-) diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb index 32c7c91..4661934 100644 --- a/dvc-3-automate-experiments.ipynb +++ b/dvc-3-automate-experiments.ipynb @@ -92,6 +92,58 @@ "# Build automated pipelines" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## params.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-03T19:30:37.877998Z", + "start_time": "2020-07-03T19:30:37.755666Z" + }, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + "data_load:\r\n", + " raw_data_path: data/iris.csv\r\n", + " classes_names_path: data/classes.json\r\n", + "\r\n", + "featurize:\r\n", + " features_path: data/iris_featurized.csv\r\n", + " target_column: target\r\n", + "\r\n", + "\r\n", + "data_split:\r\n", + " test_size: 0.2\r\n", + " train_path: data/train.csv\r\n", + " test_path: data/test.csv\r\n", + "\r\n", + "\r\n", + "train:\r\n", + " model_path: data/model.joblib\r\n", + 
"\r\n", + "\r\n", + "evaluate:\r\n", + " metrics_file: data/metrics.json\r\n", + " confusion_matrix: data/cm.csv\r\n" + ] + } + ], + "source": [ + "!cat params.yaml" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -127,7 +179,8 @@ "```bash\n", "# Create data_load pipeline stage\n", "\n", - "dvc run -n data_load \\\n", + "dvc stage add \\\n", + " -n data_load \\\n", " -d src/data_load.py \\\n", " -o data/iris.csv \\\n", " -o data/classes.json \\\n", @@ -140,37 +193,11 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T13:42:01.176530Z", - "start_time": "2020-10-21T13:42:01.147399Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4.0K\tdata/classes.json\n", - "4.0K\tdata/iris.csv\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "du -sh data/*" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-10-21T10:03:42.595865Z", - "start_time": "2020-10-21T10:03:42.471874Z" + "end_time": "2020-07-03T19:30:37.727096Z", + "start_time": "2020-07-03T19:30:37.609182Z" } }, "outputs": [ @@ -178,43 +205,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[01;34m.\u001b[00m\r\n", - "├── README.md\r\n", - "├── dvc-3-automate-experiments.ipynb\r\n", - "├── params.yaml\r\n", - "├── requirements.txt\r\n", - "└── \u001b[01;34msrc\u001b[00m\r\n", - " ├── __init__.py\r\n", - " ├── data_load.py\r\n", - " ├── evaluate.py\r\n", - " ├── featurization.py\r\n", - " ├── split_dataset.py\r\n", - " └── train.py\r\n", - "\r\n", - "1 directory, 10 files\r\n" + "stages:\r\n", + " data_load:\r\n", + " cmd: python src/data_load.py --config=params.yaml\r\n", + " deps:\r\n", + " - src/data_load.py\r\n", + " params:\r\n", + " - data_load\r\n", + " outs:\r\n", + " - data/classes.json\r\n", + " - data/iris.csv\r\n" ] } ], "source": [ - "# Note: we use `tree -I ...` pattern to 
not list those files that match the wild-card pattern.\n", - "\n", - "!tree -I dvc-venv" + "!cat dvc.yaml" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## dvc.yaml" + "## Reproduce a pipeline" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.727096Z", - "start_time": "2020-07-03T19:30:37.609182Z" + "end_time": "2020-07-03T19:30:39.781553Z", + "start_time": "2020-07-03T19:30:37.923002Z" } }, "outputs": [ @@ -222,37 +243,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n" + "Running stage 'data_load': core\u001b[39m>\n", + "> python src/data_load.py --config=params.yaml\n", + "Generating lock file 'dvc.lock' \n", + "Updating lock file 'dvc.lock'\n", + "\n", + "To track the changes with git, run:\n", + "\n", + "\tgit add dvc.lock\n", + "Use `dvc push` to send your updates to remote storage.\n", + "\u001b[0m" ] } ], "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## params.yaml" + "!dvc repro" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.877998Z", - "start_time": "2020-07-03T19:30:37.755666Z" + "end_time": "2020-10-21T13:42:01.176530Z", + "start_time": "2020-10-21T13:42:01.147399Z" }, "scrolled": true }, @@ -261,50 +275,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "\r\n", - "data_load:\r\n", - " raw_data_path: data/iris.csv\r\n", - " classes_names_path: data/classes.json\r\n", - "\r\n", - "featurize:\r\n", - " features_path: data/iris_featurized.csv\r\n", - " target_column: target\r\n", - "\r\n", - "\r\n", - "data_split:\r\n", - " 
test_size: 0.2\r\n", - " train_path: data/train.csv\r\n", - " test_path: data/test.csv\r\n", - "\r\n", - "\r\n", - "train:\r\n", - " model_path: data/model.joblib\r\n", - "\r\n", - "\r\n", - "evaluate:\r\n", - " metrics_file: data/metrics.json\r\n", - " confusion_matrix: data/cm.csv\r\n" + "4,0K\tdata/classes.json\n", + "4,0K\tdata/cm.csv\n", + "4,0K\tdata/iris.csv\n", + "4,0K\tdata/iris_featurized.csv\n", + "4,0K\tdata/metrics.json\n", + "4,0K\tdata/model.joblib\n", + "4,0K\tdata/test.csv\n", + "4,0K\tdata/train.csv\n" ] } ], "source": [ - "!cat params.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reproduce a pipeline" + "%%bash\n", + "\n", + "du -sh data/*" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-07-03T19:30:39.781553Z", - "start_time": "2020-07-03T19:30:37.923002Z" + "end_time": "2020-10-21T10:03:42.595865Z", + "start_time": "2020-10-21T10:03:42.471874Z" } }, "outputs": [ @@ -312,13 +306,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Stage 'data_load' is cached - skipping run, checking out outputs core\u001b[39m>\n", - "\u001b[0m" + "\u001b[01;34m.\u001b[00m\r\n", + "├── \u001b[01;34mdata\u001b[00m\r\n", + "│   ├── classes.json\r\n", + "│   ├── cm.csv\r\n", + "│   ├── iris.csv\r\n", + "│   ├── iris_featurized.csv\r\n", + "│   ├── metrics.json\r\n", + "│   ├── model.joblib\r\n", + "│   ├── test.csv\r\n", + "│   └── train.csv\r\n", + "├── dvc-3-automate-experiments.ipynb\r\n", + "├── dvc.lock\r\n", + "├── dvc.yaml\r\n", + "├── params.yaml\r\n", + "├── README.md\r\n", + "├── requirements.txt\r\n", + "└── \u001b[01;34msrc\u001b[00m\r\n", + " ├── data_load.py\r\n", + " ├── evaluate.py\r\n", + " ├── featurization.py\r\n", + " ├── __init__.py\r\n", + " ├── \u001b[01;34m__pycache__\u001b[00m\r\n", + " │   ├── __init__.cpython-38.pyc\r\n", + " │   └── utils.cpython-38.pyc\r\n", + " ├── split_dataset.py\r\n", + " ├── train.py\r\n", + 
" └── utils.py\r\n", + "\r\n", + "3 directories, 23 files\r\n" ] } ], "source": [ - "!dvc repro" + "# Note: we use `tree -I ...` pattern to not list those files that match the wild-card pattern.\n", + "\n", + "!tree -I dvc-venv" ] }, { @@ -333,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:30:41.698409Z", @@ -345,13 +368,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running stage 'data_load' with command: core\u001b[39m>\n", - "\tpython src/data_load.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' core\u001b[39m>\n", + "Running stage 'data_load': core\u001b[39m>\n", + "> python src/data_load.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' \n", "\n", "To track the changes with git, run:\n", "\n", "\tgit add dvc.lock\n", + "Use `dvc push` to send your updates to remote storage.\n", "\u001b[0m" ] } @@ -390,7 +414,8 @@ "source": [ "```bash\n", "\n", - "dvc run -n feature_extraction \\\n", + "dvc stage add \\\n", + " -n feature_extraction \\\n", " -d src/featurization.py \\\n", " -d data/iris.csv \\\n", " -o data/iris_featurized.csv \\\n", @@ -402,53 +427,142 @@ ] }, { - "cell_type": "code", - "execution_count": 12, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add split train/test stage" + ] + }, + { + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-10-21T14:22:48.664322Z", - "start_time": "2020-10-21T14:22:48.539481Z" + "end_time": "2020-07-03T19:30:48.044867Z", + "start_time": "2020-07-03T19:30:45.984594Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "README.md dvc.yaml\r\n", - "\u001b[1m\u001b[36mdata\u001b[m\u001b[m params.yaml\r\n", - "dvc-3-automate-experiments.ipynb requirements.txt\r\n", - "\u001b[1m\u001b[36mdvc-venv\u001b[m\u001b[m \u001b[1m\u001b[36msrc\u001b[m\u001b[m\r\n", - "dvc.lock\r\n" - ] - } - ], "source": [ - "!ls " + 
"```bash\n", + "\n", + "dvc stage add \\\n", + " -n split_dataset \\\n", + " -d src/split_dataset.py \\\n", + " -d data/iris_featurized.csv \\\n", + " -o data/train.csv \\\n", + " -o data/test.csv \\\n", + " -p featurize,data_split \\\n", + " python src/split_dataset.py \\\n", + " --config=params.yaml\n", + "```" ] }, { - "cell_type": "code", - "execution_count": 13, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add train stage" + ] + }, + { + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-10-21T14:22:55.648732Z", - "start_time": "2020-10-21T14:22:55.526727Z" + "end_time": "2020-07-03T19:30:50.298161Z", + "start_time": "2020-07-03T19:30:48.275068Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", + "source": [ + "```bash\n", + "\n", + "dvc stage add \\\n", + " -n train \\\n", + " -d src/train.py \\\n", + " -d data/train.csv \\\n", + " -o data/model.joblib \\\n", + " -p data_split,train \\\n", + " python src/train.py \\\n", + " --config=params.yaml\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add evaluate stage" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-10-21T13:03:19.945663Z", + "start_time": "2020-10-21T13:03:19.941005Z" + } + }, + "source": [ + "```bash\n", + "\n", + "dvc stage add \\\n", + " -n evaluate \\\n", + " -d src/evaluate.py \\\n", + " -d data/test.csv \\\n", + " -d data/model.joblib \\\n", + " -d data/classes.json \\\n", + " -m data/metrics.json \\\n", + " --plots data/cm.csv \\\n", + " -p data_load,data_split,train,evaluate \\\n", + " python src/evaluate.py \\\n", + " --config=params.yaml\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + 
"ExecuteTime": { + "end_time": "2020-10-21T14:22:48.664322Z", + "start_time": "2020-10-21T14:22:48.539481Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data\t\t\t\t dvc.lock dvc.yaml\t README.md\t src\r\n", + "dvc-3-automate-experiments.ipynb dvc-venv params.yaml requirements.txt\r\n" + ] + } + ], + "source": [ + "!ls " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-10-21T14:23:25.865051Z", + "start_time": "2020-10-21T14:23:25.749060Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stages:\r\n", + " data_load:\r\n", + " cmd: python src/data_load.py --config=params.yaml\r\n", + " deps:\r\n", + " - src/data_load.py\r\n", + " params:\r\n", + " - data_load\r\n", + " outs:\r\n", " - data/classes.json\r\n", " - data/iris.csv\r\n", " feature_extraction:\r\n", @@ -460,7 +574,44 @@ " - data_load\r\n", " - featurize\r\n", " outs:\r\n", - " - data/iris_featurized.csv\r\n" + " - data/iris_featurized.csv\r\n", + " split_dataset:\r\n", + " cmd: python src/split_dataset.py --config=params.yaml\r\n", + " deps:\r\n", + " - data/iris_featurized.csv\r\n", + " - src/split_dataset.py\r\n", + " params:\r\n", + " - data_split\r\n", + " - featurize\r\n", + " outs:\r\n", + " - data/test.csv\r\n", + " - data/train.csv\r\n", + " train:\r\n", + " cmd: python src/train.py --config=params.yaml\r\n", + " deps:\r\n", + " - data/train.csv\r\n", + " - src/train.py\r\n", + " params:\r\n", + " - data_split\r\n", + " - train\r\n", + " outs:\r\n", + " - data/model.joblib\r\n", + " evaluate:\r\n", + " cmd: python src/evaluate.py --config=params.yaml\r\n", + " deps:\r\n", + " - data/classes.json\r\n", + " - data/model.joblib\r\n", + " - data/test.csv\r\n", + " - src/evaluate.py\r\n", + " params:\r\n", + " - data_load\r\n", + " - data_split\r\n", + " - evaluate\r\n", + " - train\r\n", + " metrics:\r\n", + " - 
data/metrics.json\r\n", + " plots:\r\n", + " - data/cm.csv\r\n" ] } ], @@ -468,9 +619,59 @@ "!cat dvc.yaml" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reproduce DVC pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", + "Running stage 'feature_extraction':\n", + "> python src/featurization.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' \n", + "\n", + "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock' \n", + "\n", + "Stage 'train' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock' \n", + "\n", + "Running stage 'evaluate':\n", + "> python src/evaluate.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' \n", + "\n", + "To track the changes with git, run:\n", + "\n", + "\tgit add dvc.lock\n", + "Use `dvc push` to send your updates to remote storage.\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!dvc repro" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:22:58.124155Z", @@ -560,7 +761,7 @@ "4 5.0 3.6 1.4 0.2 0" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -583,370 +784,33 @@ "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-10-21T12:58:37.280454Z", - "start_time": "2020-10-21T12:58:37.272728Z" - } - }, - "source": [ - "```bash\n", - "# Check Git status\n", - "\n", - "git status -s\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.961182Z", - "start_time": "2020-07-03T19:30:45.916816Z" - } - }, - 
"source": [ - "```bash\n", - "# Commit changes \n", - "\n", - "git add .\n", - "git commit -m \"Add stage features_extraction\"\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add split train/test stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.044867Z", - "start_time": "2020-07-03T19:30:45.984594Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc run -n split_dataset \\\n", - " -d src/split_dataset.py \\\n", - " -d data/iris_featurized.csv \\\n", - " -o data/train.csv \\\n", - " -o data/test.csv \\\n", - " -p featurize,data_split \\\n", - " python src/split_dataset.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.250249Z", - "start_time": "2020-07-03T19:30:48.209429Z" - } - }, - "source": [ - "```bash\n", - "# Commit changes\n", - "\n", - "git add .\n", - "git commit -m \"Add stage split_dataset\"\n", - "\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:23:25.865051Z", - "start_time": "2020-10-21T14:23:25.749060Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " deps:\r\n", - " - 
data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add train stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.298161Z", - "start_time": "2020-07-03T19:30:48.275068Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc run -n train \\\n", - " -d src/train.py \\\n", - " -d data/train.csv \\\n", - " -o data/model.joblib \\\n", - " -p data_split,train \\\n", - " python src/train.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.512656Z", - "start_time": "2020-07-03T19:30:50.468759Z" - } - }, - "source": [ - "```bash\n", - "# Commit changes\n", - "\n", - "git add .\n", - "git commit -m \"Add stage train\"\n", - "\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:23:55.190584Z", - "start_time": "2020-10-21T14:23:55.074531Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " 
deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n", - " train:\r\n", - " cmd: python src/train.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/train.csv\r\n", - " - src/train.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - train\r\n", - " outs:\r\n", - " - data/model.joblib\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add evaluate stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T13:03:19.945663Z", - "start_time": "2020-10-21T13:03:19.941005Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc run -n evaluate \\\n", - " -d src/evaluate.py \\\n", - " -d data/test.csv \\\n", - " -d data/model.joblib \\\n", - " -d data/classes.json \\\n", - " -m data/metrics.json \\\n", - " --plots data/cm.csv \\\n", - " -p data_load,data_split,train,evaluate \\\n", - " python src/evaluate.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:52.971253Z", - "start_time": "2020-07-03T19:30:52.919420Z" + "end_time": "2020-10-21T12:58:37.280454Z", + "start_time": "2020-10-21T12:58:37.272728Z" } }, "source": [ "```bash\n", - "# Commit changes\n", + "# Check Git status\n", "\n", - "git add .\n", - "git commit -m \"Add stage evaluate\"\n", + "git status -s\n", "```" ] }, { - "cell_type": "code", - "execution_count": 17, + "cell_type": "markdown", "metadata": { "ExecuteTime": { - "end_time": "2020-10-21T14:24:17.094698Z", - "start_time": "2020-10-21T14:24:16.977100Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " 
deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n", - " train:\r\n", - " cmd: python src/train.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/train.csv\r\n", - " - src/train.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - train\r\n", - " outs:\r\n", - " - data/model.joblib\r\n", - " evaluate:\r\n", - " cmd: python src/evaluate.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/classes.json\r\n", - " - data/model.joblib\r\n", - " - data/test.csv\r", - "\r\n", - " - src/evaluate.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - data_split\r\n", - " - evaluate\r\n", - " - train\r\n", - " metrics:\r\n", - " - data/metrics.json\r\n", - " plots:\r\n", - " - data/cm.csv\r\n" - ] + "end_time": "2020-07-03T19:30:45.961182Z", + "start_time": "2020-07-03T19:30:45.916816Z" } - ], + }, "source": [ - "!cat dvc.yaml" + "```bash\n", + "# Commit changes \n", + "\n", + "git add .\n", + "git commit -m \"Complete DVC pipeline\"\n", + "```" ] }, { @@ -981,7 +845,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:31:02.889684Z", @@ -1057,7 +921,7 @@ "in file __featurization.py__ in function`get_features()` after line \n", "\n", "```python\n", - " features = dataset.copy()\n", + " features = 
get_features(dataset)\n", "```\n", "\n", "add lines:\n", @@ -1072,12 +936,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reproduce pipeline " + "### Create experiment" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:26:16.419175Z", @@ -1092,31 +956,39 @@ "text": [ "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", "Stage 'feature_extraction' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Updating lock file 'dvc.lock' \n", "\n", "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Updating lock file 'dvc.lock' \n", "\n", "Stage 'train' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Updating lock file 'dvc.lock' \n", "\n", "Stage 'evaluate' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Updating lock file 'dvc.lock' \n", "\n", "To track the changes with git, run:\n", "\n", - "\tgit add dvc.lock\n", + "\tgit add dvc.lock src/featurization.py\n", + " core\u001b[39m>\n", + "Reproduced experiment(s): exp1-ratio-features\n", + "Experiment results have been applied to your workspace.\n", + "\n", + "To promote an experiment to a Git branch run:\n", + "\n", + "\tdvc exp branch \n", + "\n", "\u001b[0m" ] } ], "source": [ - "!dvc repro" + "!dvc exp run -n exp1-ratio-features" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:26:17.593890Z", @@ -1225,7 +1097,7 @@ "4 1.388889 7.0 " ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1241,7 +1113,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:26:20.852572Z", @@ -1253,15 +1125,16 @@ "name": 
"stdout", "output_type": "stream", "text": [ - "On branch exp1-ratio-features\r\n", - "Changes not staged for commit:\r\n", - " (use \"git add ...\" to update what will be committed)\r\n", + "На ветке exp1-ratio-features\r\n", + "Изменения, которые не в индексе для коммита:\r\n", + " (используйте «git add <файл>…», чтобы добавить файл в индекс)\r\n", " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mmodified: dvc-3-automate-experiments.ipynb\u001b[m\r\n", - "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n", - "\t\u001b[31mmodified: src/featurization.py\u001b[m\r\n", + "\t\u001b[31mизменено: dvc-3-automate-experiments.ipynb\u001b[m\r\n", + "\t\u001b[31mизменено: dvc.lock\u001b[m\r\n", + "\t\u001b[31mизменено: src/featurization.py\u001b[m\r\n", "\r\n", - "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" + "нет изменений добавленных для коммита\r\n", + "(используйте «git add» и/или «git commit -a»)\r\n" ] } ], @@ -1271,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:26:28.905617Z", @@ -1353,7 +1226,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:32:52.254763Z", @@ -1404,12 +1277,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reproduce pipelines" + "### Create experiment" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2020-10-21T14:27:55.449989Z", @@ -1423,36 +1296,36 @@ "output_type": "stream", "text": [ "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Stage 'feature_extraction' didn't change, skipping\n", + "Stage 'split_dataset' didn't change, skipping\n", + "Stage 'train' is cached - skipping 
run, checking out outputs\n", + "Updating lock file 'dvc.lock' \n", "\n", - "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock'\n", + "Stage 'evaluate' is cached - skipping run, checking out outputs\n", + "Updating lock file 'dvc.lock' \n", + "\n", + "To track the changes with git, run:\n", "\n", - "Running stage 'train' with command:\n", - "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' core\u001b[39m>\n", + "\tgit add dvc.lock src/train.py\n", + " core\u001b[39m>\n", + "Reproduced experiment(s): exp2-tuning-logreg\n", + "Experiment results have been applied to your workspace.\n", "\n", - "Running stage 'evaluate' with command:\n", - "\tpython src/evaluate.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' core\u001b[39m>\n", + "To promote an experiment to a Git branch run:\n", "\n", - "To track the changes with git, run:\n", + "\tdvc exp branch \n", "\n", - "\tgit add dvc.lock\n", "\u001b[0m" ] } ], "source": [ - "# Re-run pipeline \n", - "\n", - "!dvc repro" + "!dvc exp run -n exp2-tuning-logreg" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:03:21.593735Z", @@ -1476,7 +1349,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:03:48.640425Z", @@ -1488,8 +1361,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: core\u001b[39m>\n", - "\t\tf1_score: 0.9305555555555555\n", + "Path f1_score core\u001b[39m>\n", + "data/metrics.json 0.93056\n", "\u001b[0m" ] } @@ -1500,26 +1373,35 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:03:52.823772Z", - "start_time": "2020-10-22T14:03:50.239522Z" - } - }, + "execution_count": 31, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 0.93056 0.77671\n", + "Path Metric Value Change\n", + "data/metrics.json f1_score 0.93056 0.77671\n", + "\n", + "Path Param Value Change\n", + "params.yaml data_load.classes_names_path data/classes.json diff not supported\n", + "params.yaml data_load.dummy_param dummy_value diff not supported\n", + "params.yaml data_load.raw_data_path data/iris.csv diff not supported\n", + "params.yaml data_split.test_path data/test.csv diff not supported\n", + "params.yaml data_split.test_size 0.2 0\n", + "params.yaml data_split.train_path data/train.csv diff not supported\n", + "params.yaml evaluate.confusion_matrix data/cm.csv diff not supported\n", + "params.yaml evaluate.metrics_file data/metrics.json diff not supported\n", + "params.yaml featurize.features_path data/iris_featurized.csv diff not supported\n", + "params.yaml featurize.target_column target diff not supported\n", + "params.yaml train.model_path data/model.joblib diff not supported\n", + "\n", "\u001b[0m" ] } ], "source": [ - "!dvc metrics diff --all" + "!dvc exp diff --all" ] }, { @@ -1607,12 +1489,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Reproduce pipeline " + "### Create experiment" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:36:35.537208Z", @@ -1627,28 +1509,36 @@ "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", "Stage 'feature_extraction' didn't change, skipping\n", "Stage 'split_dataset' didn't change, skipping\n", - "Running stage 'train' with command:\n", - "\tpython src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' core\u001b[39m>\n", + "Running stage 'train':\n", + "> python src/train.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' \n", "\n", - "Running stage 'evaluate' with command:\n", - "\tpython src/evaluate.py --config=params.yaml\n", - 
"Updating lock file 'dvc.lock' core\u001b[39m>\n", + "Running stage 'evaluate':\n", + "> python src/evaluate.py --config=params.yaml\n", + "Updating lock file 'dvc.lock' \n", "\n", "To track the changes with git, run:\n", "\n", - "\tgit add dvc.lock\n", + "\tgit add dvc.lock src/train.py\n", + " core\u001b[39m>\n", + "Reproduced experiment(s): exp3-svm\n", + "Experiment results have been applied to your workspace.\n", + "\n", + "To promote an experiment to a Git branch run:\n", + "\n", + "\tdvc exp branch \n", + "\n", "\u001b[0m" ] } ], "source": [ - "!dvc repro" + "!dvc exp run -n exp3-svm" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:05:25.052314Z", @@ -1660,8 +1550,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: core\u001b[39m>\n", - "\t\tf1_score: 0.9665831244778613\n", + "Path f1_score core\u001b[39m>\n", + "data/metrics.json 1.0\n", "\u001b[0m" ] } @@ -1672,7 +1562,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:05:35.904182Z", @@ -1684,14 +1574,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "On branch exp3-svm\r\n", - "Changes not staged for commit:\r\n", - " (use \"git add ...\" to update what will be committed)\r\n", + "На ветке exp3-svm\r\n", + "Изменения, которые не в индексе для коммита:\r\n", + " (используйте «git add <файл>…», чтобы добавить файл в индекс)\r\n", " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mmodified: dvc.lock\u001b[m\r\n", - "\t\u001b[31mmodified: src/train.py\u001b[m\r\n", + "\t\u001b[31mизменено: dvc.lock\u001b[m\r\n", + "\t\u001b[31mизменено: src/train.py\u001b[m\r\n", "\r\n", - "no changes added to commit (use \"git add\" and/or \"git commit -a\")\r\n" + "нет изменений добавленных для коммита\r\n", + "(используйте «git add» и/или «git commit -a»)\r\n" ] } ], @@ 
-1803,6 +1694,7 @@ "text": [ "Path Param Old Newre\u001b[39m>\n", "params.yaml data_load.classes_names_path data/classes.json data/classes.json\n", + "params.yaml data_load.dummy_param dummy_value dummy_value\n", "params.yaml data_load.raw_data_path data/iris.csv data/iris.csv\n", "params.yaml data_split.test_path data/test.csv data/test.csv\n", "params.yaml data_split.test_size 0.2 0.2\n", @@ -1824,7 +1716,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:09:20.304575Z", @@ -1836,7 +1728,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"params.yaml\": {\"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"data_load.dummy_param\": {\"old\": \"dummy_value\", \"new\": \"dummy_value\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}, \"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}}}\n", + "{\"params.yaml\": {\"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"data_load.dummy_param\": {\"old\": \"dummy_value\", \"new\": \"dummy_value\"}, 
\"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}}}\n", "\u001b[0m" ] } @@ -1847,7 +1739,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2020-07-03T19:09:27.495017Z", @@ -1883,7 +1775,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:08:29.319419Z", @@ -1895,85 +1787,93 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mcommit 336832e6c8c51861d58e258b6cf7bc5ddc750459\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp3_svm\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Thu Oct 22 17:05:44 2020 +0300\r\n", + "\u001b[33mcommit ecb3887bcb625b372862bd612d6cc4b7392bbd35\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp3_svm\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp3-svm\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 23:28:45 2021 +0900\r\n", "\r\n", " Experiment 3 with SVM estimator\r\n", "\r\n", - "\u001b[33mcommit aff5b7f5d143895108b4dac9939a9c0cd06a349d\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp2_tuning_logreg\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp2-tuning-logreg\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", 
- "Date: Thu Oct 22 17:04:33 2020 +0300\r\n", + "\u001b[33mcommit 707d2606c349c7b9a7fb8fe3fbcd44d632bcc127\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp2_tuning_logreg\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp2-tuning-logreg\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 23:26:19 2021 +0900\r\n", "\r\n", " Tune model. LogisticRegression. C=0.1\r\n", "\r\n", - "\u001b[33mcommit 7ab2b518063b63742a396ca83ce6a092a260589a\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Oct 21 17:27:03 2020 +0300\r\n", + "\u001b[33mcommit 4aab30df0f3eba12f1a11e672268912f8bcf2ebb\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 23:21:59 2021 +0900\r\n", "\r\n", " Experiment with new features\r\n", "\r\n", - "\u001b[33mcommit 7619688214cc3b9fe3d3b59674c07c12fc134b47\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Oct 21 17:24:13 2020 +0300\r\n", + "\u001b[33mcommit 78d4c7dcecef33f58ddead3d2066be0774c711f2\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 23:07:18 2021 +0900\r\n", "\r\n", - " Add stage evaluate\r\n", + " Complete DVC pipeline\r\n", "\r\n", - "\u001b[33mcommit 2a59d083d38b1a15dab6fe4c2b53c00a9d7f0447\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Oct 21 17:23:50 2020 +0300\r\n", + "\u001b[33mcommit abe17a966ef4b4fe0609a4d9e7782e2ead7ebee7\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 22:45:51 2021 +0900\r\n", "\r\n", - " Add stage train\r\n", + " Initialize DVC\r\n", "\r\n", - "\u001b[33mcommit 1a908460f95ed339ccdf735a193054b8af0632dc\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Oct 21 17:23:22 2020 +0300\r\n", + "\u001b[33mcommit 875d1963ff4a87bb9d3ae691ad3e80b8b985eeb0\u001b[m\u001b[33m 
(\u001b[m\u001b[1;32mdvc-2\u001b[m\u001b[33m)\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 22:45:17 2021 +0900\r\n", "\r\n", - " Add stage split_dataset\r\n", + " Read config using python-box\r\n", "\r\n", - "\u001b[33mcommit 915ba326763d905b4c3559cb29ec825f48fb11f0\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Wed Oct 21 17:23:04 2020 +0300\r\n", + "\u001b[33mcommit 3a35f44c72841ab7e9a02ae0fed85141104e08f9\u001b[m\r\n", + "Author: AlexKolosov \r\n", + "Date: Mon Mar 15 22:44:36 2021 +0900\r\n", "\r\n", - " Add stage features_extraction\r\n", + " Update dependencies\r\n", "\r\n", - "\u001b[33mcommit 0e12b6aab49d3d96570f95950df411ccf00c6326\u001b[m\r\n", + "\u001b[33mcommit 5cdaaca1ac7b67fb1acad0ffee3bb8c60f31a70c\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/master\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/HEAD\u001b[m\u001b[33m, \u001b[m\u001b[1;32mmaster\u001b[m\u001b[33m)\u001b[m\r\n", + "Merge: 3b224bb f6f1318\r\n", "Author: Mikhail \r\n", - "Date: Wed Oct 21 12:56:45 2020 +0300\r\n", + "Date: Fri Oct 23 08:33:39 2020 +0000\r\n", "\r\n", - " Initialize DVC\r\n", - "\r\n", - "\u001b[33mcommit 25cb0004d22fdc923536ac5232e146361e74e90c\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/update-dvc-tutorials\u001b[m\u001b[33m, \u001b[m\u001b[1;32mupdate-dvc-tutorials\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Mon Oct 19 18:03:15 2020 +0900\r\n", - "\r\n", - " Update tutorial notebook\r\n", + " Merge branch 'dev' into 'master'\r\n", + " \r\n", + " Release 1.0\r\n", + " \r\n", + " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!6\r\n", "\r\n", - "\u001b[33mcommit bf430d9af143db178d6fa39b405437ee9f8ab1f3\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Mon Oct 19 17:36:07 2020 +0900\r\n", + "\u001b[33mcommit f6f1318f12e40af52206d8307491569fb9269333\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m)\u001b[m\r\n", + "Merge: aa6bd54 3b224bb\r\n", + "Author: Mikhail \r\n", + "Date: Fri Oct 23 
08:33:32 2020 +0000\r\n", "\r\n", - " Update .gitignore\r\n", + " Merge branch 'master' into 'dev'\r\n", + " \r\n", + " # Conflicts:\r\n", + " # README.md\r\n", "\r\n", - "\u001b[33mcommit 0ecca2cdab5972ba68f88cc395b2c4ba4dc47dcf\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Mon Oct 19 17:35:42 2020 +0900\r\n", + "\u001b[33mcommit aa6bd541d9212b8b153942f708a205c611081329\u001b[m\r\n", + "Merge: 15bd59f d135235\r\n", + "Author: Mikhail \r\n", + "Date: Fri Oct 23 08:32:29 2020 +0000\r\n", "\r\n", - " Update documentation\r\n", + " Merge branch 'release-1.0' into 'dev'\r\n", + " \r\n", + " Release 1.0\r\n", + " \r\n", + " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!5\r\n", "\r\n", - "\u001b[33mcommit c6293c84e8aa3b0e65afaaecc1a3c9b03d6b2dec\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Mon Oct 19 17:35:23 2020 +0900\r\n", + "\u001b[33mcommit d1352356bb87bc620815ad32c0ddd2b37ef75b14\u001b[m\r\n", + "Author: Mikhail \r\n", + "Date: Fri Oct 23 08:32:29 2020 +0000\r\n", "\r\n", - " Update versions of python libraries\r\n", + " Release 1.0\r\n", "\r\n", - "\u001b[33mcommit 15bd59fe85e1e002d1ea45230dc61f3b9c4dcfe3\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 15bd59fe85e1e002d1ea45230dc61f3b9c4dcfe3\u001b[m\r\n", "Author: Alex \r\n", "Date: Tue Aug 18 12:45:16 2020 +0900\r\n", "\r\n", " Update DVC -> 1.5.0\r\n", "\r\n", - "\u001b[33mcommit 3573af273082ae1cad1b141131eb824e314eee43\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdev\u001b[m\u001b[33m)\u001b[m\r\n", + "\u001b[33mcommit 3573af273082ae1cad1b141131eb824e314eee43\u001b[m\r\n", "Merge: aeecba0 1102dc2\r\n", "Author: Mikhail \r\n", "Date: Sat Jul 4 07:23:25 2020 +0300\r\n", @@ -2089,7 +1989,13 @@ "Author: Alex \r\n", "Date: Tue Jun 9 19:27:13 2020 +0900\r\n", "\r\n", - " create repo structure for lesson 4\r\n" + " create repo structure for lesson 4\r\n", + "\r\n", + "\u001b[33mcommit 3b224bb7dad464cfc9165606c468ece07a8b7e8f\u001b[m\r\n", + 
"Author: Alexander Kolosov \r\n", + "Date: Tue Jun 9 06:13:04 2020 +0000\r\n", + "\r\n", + " Initial commit\r\n" ] } ], @@ -2130,7 +2036,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:09:14.039904Z", @@ -2143,8 +2049,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\tdata/metrics.json: core\u001b[39m>\n", - "\t\tf1_score: 0.9665831244778613\n", + "Path f1_score core\u001b[39m>\n", + "data/metrics.json 1.0\n", "\u001b[0m" ] } @@ -2157,7 +2063,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:09:18.038960Z", @@ -2169,27 +2075,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "dvc-tutorial: core\u001b[39m>\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9665831244778613\n", - "exp1-ratio-features:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.15384615384615383\n", - "exp2-tuning-logreg:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9305555555555555\n", - "exp3-svm:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9665831244778613\n", - "exp1_ratio_features:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.15384615384615383\n", - "exp2_tuning_logreg:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9305555555555555\n", - "exp3_svm:\n", - "\tdata/metrics.json:\n", - "\t\tf1_score: 0.9665831244778613\n", + "Revision Path f1_score core\u001b[39m>\n", + "workspace data/metrics.json 1.0\n", + "dvc-tutorial, exp3-svm, exp3_svm data/metrics.json 1.0\n", + "exp1-ratio-features, exp1_ratio_features data/metrics.json 0.15385\n", + "exp2-tuning-logreg, exp2_tuning_logreg data/metrics.json 0.93056\n", "\u001b[0m" ] } @@ -2209,7 +2099,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:09:38.852532Z", @@ -2231,7 +2121,7 @@ }, { "cell_type": "code", - "execution_count": 45, + 
"execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:09:41.557718Z", @@ -2243,8 +2133,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.96658 0.96658 0.0\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 1.0 1.0 0.0\n", "\u001b[0m" ] } @@ -2264,7 +2154,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:09:57.211470Z", @@ -2276,8 +2166,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 0.96658 0.81274\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 1.0 0.84615\n", "\u001b[0m" ] } @@ -2290,7 +2180,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:10:03.187146Z", @@ -2302,8 +2192,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 0.96658 0.81274\n", + "Path Metric Old New Change core\u001b[39m>\n", + "data/metrics.json f1_score 0.15385 1.0 0.84615\n", "\u001b[0m" ] } @@ -2316,7 +2206,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:10:38.911976Z", @@ -2329,9 +2219,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "| Path | Metric | Old | New | Change | core\u001b[39m>\n", - "|-------------------|----------|---------|---------|----------|\n", - "| data/metrics.json | f1_score | 0.15385 | 0.96658 | 0.81274 |\n", + "| Path | Metric | Old | New | Change | core\u001b[39m>\n", + "|-------------------|----------|---------|-------|----------|\n", + "| data/metrics.json | f1_score | 0.15385 | 1.0 | 0.84615 |\n", "\n", 
"\u001b[0m" ] @@ -2341,12 +2231,89 @@ "!dvc metrics diff exp1-ratio-features --show-md" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare experiments" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m" + ] + } + ], + "source": [ + "!dvc exp diff" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path Metric Value Change\n", + "data/metrics.json f1_score 0.93056 0.77671\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!dvc exp diff exp1-ratio-features exp2-tuning-logreg" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path Metric Value Change\n", + "data/metrics.json f1_score 1 0.84615\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!dvc exp diff exp1-ratio-features exp3-svm" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path Metric Value Change\n", + "data/metrics.json f1_score 1 0.069444\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!dvc exp diff exp2-tuning-logreg exp3-svm" + ] }, { "cell_type": "markdown", @@ -2357,7 +2324,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 54, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:11:04.697127Z", @@ -2378,7 +2345,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 55, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:15:07.599224Z", @@ -2390,7 +2357,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "file:///Users/mnrozhkov/dev/dvc/dvc-3-automate-experiments/data/plots-show.html\n", + 
"file:///home/alex/Dev/Projects/tutorials-dvc/dvc-3-automate-experiments/data/plots-show.html\n", "\u001b[0m" ] } @@ -2401,7 +2368,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2020-10-22T14:15:07.664691Z", @@ -2424,10 +2391,10 @@ " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 61, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -2528,7 +2495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.5" }, "toc": { "base_numbering": 1, @@ -2543,7 +2510,7 @@ "height": "calc(100% - 180px)", "left": "10px", "top": "150px", - "width": "230.947px" + "width": "230.938px" }, "toc_section_display": true, "toc_window_display": true From f834df1c2dc72de4b6e7492e0620326bdcde9923 Mon Sep 17 00:00:00 2001 From: AlexKolosov Date: Tue, 16 Mar 2021 13:17:35 +0900 Subject: [PATCH 6/8] Update documentation and tutorial notebook --- README.md | 2 +- dvc-3-automate-experiments.ipynb | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b134091..841fc8b 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ python -m ipykernel install --user --name=dvc-venv ## 5. 
Configure ToC for jupyter notebook (optional) ```bash -sudo jupyter contrib nbextension install +jupyter contrib nbextension install --user jupyter nbextension enable toc2/main ``` diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb index 4661934..e0e9416 100644 --- a/dvc-3-automate-experiments.ipynb +++ b/dvc-3-automate-experiments.ipynb @@ -7,8 +7,13 @@ "# Install and init DVC\n", "\n", "Prerequisites: \n", - "- DVC and requirements.txt packages installed (if not - check README.md file for instructions)\n", - "- A project repository is a Git repo \n", + "- DVC and requirements.txt packages installed (if not - check README.md file for instructions)\n", + "- a project repository is a Git repo\n", + "- CLI program `tree`\n", + "\n", + "```bash\n", + "sudo apt install tree\n", + "```\n", "\n" ] }, From ec41d3913e682b2c17090ed4ef268cc6c98aa862 Mon Sep 17 00:00:00 2001 From: AlexanderKolosov Date: Fri, 19 May 2023 20:08:13 +0900 Subject: [PATCH 7/8] Update tutorial: 1. delete jupyter packages 2. delete tutorial jupyter notebook 3. update packages versions 4. 
update README.md --- .gitignore | 20 +- README.md | 40 +- dvc-3-automate-experiments.ipynb | 2555 ------------------------------ params.yaml | 2 +- requirements.txt | 23 +- 5 files changed, 27 insertions(+), 2613 deletions(-) delete mode 100644 dvc-3-automate-experiments.ipynb diff --git a/.gitignore b/.gitignore index f6307b8..eb43ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,15 @@ -# folders +# Environments +.venv -## IDEs +# IDEs .idea +.vscode -## Python -dvc-venv +# Mac OS +.DS_Store + +# Python __pycache__ -.ipynb_checkpoints -## Project +# Project data - -# files - -## Mac OS -.DS_Store diff --git a/README.md b/README.md index 841fc8b..e9f6e7a 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,17 @@ -# Tutorial: Automate DVC experiments (lesson 3) -## Machine Learning experiments reproducibility and engineering with DVC // ML REPA School +# Tutorial: Automate DVC experiments -## 1. Clone this repository +## 1. Create and activate virtual environment -```bash -git https://github.com/mlrepa/dvc-3-automate-experiments.git -cd dvc-3-automate-experiments -``` +Create virtual environment -## 2. Create and activate virtual environment - -Create virtual environment named `dvc` (you may use other name) ```bash -python3 -m venv dvc-venv -echo "export PYTHONPATH=$PWD" >> dvc-venv/bin/activate -source dvc-venv/bin/activate +python3 -m venv .venv +echo "export PYTHONPATH=$PWD" >> .venv/bin/activate +source .venv/bin/activate ``` -## 3. Install python libraries +## 2. Install python libraries ```bash pip install -r requirements.txt ``` - -## 4. Add Virtual Environment to Jupyter Notebook - -```bash -python -m ipykernel install --user --name=dvc-venv -``` - -## 5. Configure ToC for jupyter notebook (optional) - -```bash -jupyter contrib nbextension install --user -jupyter nbextension enable toc2/main -``` - -## 6. 
Run and follow Jupyter Notebook `dvc-3-automate-experiments.ipynb` for instructions: - -```bash -jupyter notebook -``` diff --git a/dvc-3-automate-experiments.ipynb b/dvc-3-automate-experiments.ipynb deleted file mode 100644 index e0e9416..0000000 --- a/dvc-3-automate-experiments.ipynb +++ /dev/null @@ -1,2555 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Install and init DVC\n", - "\n", - "Prerequisites: \n", - "- DVC and requirements.txt packages installed (if not - check README.md file for instructions)\n", - "- a project repository is a Git repo\n", - "- CLI program `tree`\n", - "\n", - "```bash\n", - "sudo apt install tree\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Checkout branch `tutorial`" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:19.401395Z", - "start_time": "2020-07-01T07:32:19.271265Z" - } - }, - "source": [ - "```bash\n", - "git checkout -b dvc-tutorial\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2019-06-08T11:18:29.199273Z", - "start_time": "2019-06-08T11:18:29.196865Z" - } - }, - "source": [ - "## Initialize DVC\n", - "\n", - "References: \n", - "- https://dvc.org/doc/get-started/initialize " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:22.463407Z", - "start_time": "2020-07-01T07:32:21.450728Z" - } - }, - "source": [ - "```bash\n", - "dvc init\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Commit changes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-01T07:32:26.446894Z", - "start_time": "2020-07-01T07:32:26.392814Z" - } - }, - "source": [ - "```bash\n", - "\n", - "git add .\n", - "git commit -m \"Initialize DVC\"\n", - "```" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "# Build automated pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## params.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.877998Z", - "start_time": "2020-07-03T19:30:37.755666Z" - }, - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "data_load:\r\n", - " raw_data_path: data/iris.csv\r\n", - " classes_names_path: data/classes.json\r\n", - "\r\n", - "featurize:\r\n", - " features_path: data/iris_featurized.csv\r\n", - " target_column: target\r\n", - "\r\n", - "\r\n", - "data_split:\r\n", - " test_size: 0.2\r\n", - " train_path: data/train.csv\r\n", - " test_path: data/test.csv\r\n", - "\r\n", - "\r\n", - "train:\r\n", - " model_path: data/model.joblib\r\n", - "\r\n", - "\r\n", - "evaluate:\r\n", - " metrics_file: data/metrics.json\r\n", - " confusion_matrix: data/cm.csv\r\n" - ] - } - ], - "source": [ - "!cat params.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create `data_load` stage\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:35.023136Z", - "start_time": "2020-07-03T19:30:34.904974Z" - } - }, - "source": [ - "```bash\n", - "# Create `data` directory\n", - "\n", - "mkdir -p data\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T09:58:42.844179Z", - "start_time": "2020-10-21T09:58:42.840016Z" - } - }, - "source": [ - "```bash\n", - "# Create data_load pipeline stage\n", - "\n", - "dvc stage add \\\n", - " -n data_load \\\n", - " -d src/data_load.py \\\n", - " -o data/iris.csv \\\n", - " -o data/classes.json \\\n", - " -p data_load \\\n", - " python src/data_load.py \\\n", - " --config=params.yaml\n", - "\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:37.727096Z", - "start_time": "2020-07-03T19:30:37.609182Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reproduce a pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:39.781553Z", - "start_time": "2020-07-03T19:30:37.923002Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running stage 'data_load': core\u001b[39m>\n", - "> python src/data_load.py --config=params.yaml\n", - "Generating lock file 'dvc.lock' \n", - "Updating lock file 'dvc.lock'\n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "Use `dvc push` to send your updates to remote storage.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T13:42:01.176530Z", - "start_time": "2020-10-21T13:42:01.147399Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4,0K\tdata/classes.json\n", - "4,0K\tdata/cm.csv\n", - "4,0K\tdata/iris.csv\n", - "4,0K\tdata/iris_featurized.csv\n", - "4,0K\tdata/metrics.json\n", - "4,0K\tdata/model.joblib\n", - "4,0K\tdata/test.csv\n", - "4,0K\tdata/train.csv\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "du -sh data/*" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-10-21T10:03:42.595865Z", - "start_time": "2020-10-21T10:03:42.471874Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[01;34m.\u001b[00m\r\n", - "├── \u001b[01;34mdata\u001b[00m\r\n", - "│   ├── classes.json\r\n", - "│   ├── cm.csv\r\n", - "│   ├── iris.csv\r\n", - "│   ├── iris_featurized.csv\r\n", - "│   ├── metrics.json\r\n", - "│   ├── model.joblib\r\n", - "│   ├── test.csv\r\n", - "│   └── train.csv\r\n", - "├── dvc-3-automate-experiments.ipynb\r\n", - "├── dvc.lock\r\n", - "├── dvc.yaml\r\n", - "├── params.yaml\r\n", - "├── README.md\r\n", - "├── requirements.txt\r\n", - "└── \u001b[01;34msrc\u001b[00m\r\n", - " ├── data_load.py\r\n", - " ├── evaluate.py\r\n", - " ├── featurization.py\r\n", - " ├── __init__.py\r\n", - " ├── \u001b[01;34m__pycache__\u001b[00m\r\n", - " │   ├── __init__.cpython-38.pyc\r\n", - " │   └── utils.cpython-38.pyc\r\n", - " ├── split_dataset.py\r\n", - " ├── train.py\r\n", - " └── utils.py\r\n", - "\r\n", - "3 directories, 23 files\r\n" - ] - } - ], - "source": [ - "# Note: we use `tree -I ...` pattern to not list those files that match the wild-card pattern.\n", - "\n", - "!tree -I dvc-venv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Change params.yaml and reproduce \n", - "\n", - "Add a new line into `data_load` section:\n", - " `dummy_param: dummy_value`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:41.698409Z", - "start_time": "2020-07-03T19:30:39.807607Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running stage 'data_load': core\u001b[39m>\n", - "> python src/data_load.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "Use `dvc push` to send your updates to remote storage.\n", - "\u001b[0m" - ] - } - ], - "source": 
[ - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build end-to-end Machine Learning pipeline\n", - "Stages \n", - "- extract features \n", - "- split dataset \n", - "- train \n", - "- evaluate \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add feature extraction stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.387596Z", - "start_time": "2020-07-03T19:30:43.388868Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc stage add \\\n", - " -n feature_extraction \\\n", - " -d src/featurization.py \\\n", - " -d data/iris.csv \\\n", - " -o data/iris_featurized.csv \\\n", - " -p data_load,featurize \\\n", - " python src/featurization.py \\\n", - " --config=params.yaml\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add split train/test stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:48.044867Z", - "start_time": "2020-07-03T19:30:45.984594Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc stage add \\\n", - " -n split_dataset \\\n", - " -d src/split_dataset.py \\\n", - " -d data/iris_featurized.csv \\\n", - " -o data/train.csv \\\n", - " -o data/test.csv \\\n", - " -p featurize,data_split \\\n", - " python src/split_dataset.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add train stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:50.298161Z", - "start_time": "2020-07-03T19:30:48.275068Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc stage add \\\n", - " -n train \\\n", - " -d src/train.py \\\n", - " -d data/train.csv \\\n", - " -o data/model.joblib \\\n", - " -p data_split,train \\\n", - " python src/train.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - 
{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add evaluate stage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T13:03:19.945663Z", - "start_time": "2020-10-21T13:03:19.941005Z" - } - }, - "source": [ - "```bash\n", - "\n", - "dvc stage add \\\n", - " -n evaluate \\\n", - " -d src/evaluate.py \\\n", - " -d data/test.csv \\\n", - " -d data/model.joblib \\\n", - " -d data/classes.json \\\n", - " -m data/metrics.json \\\n", - " --plots data/cm.csv \\\n", - " -p data_load,data_split,train,evaluate \\\n", - " python src/evaluate.py \\\n", - " --config=params.yaml\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:22:48.664322Z", - "start_time": "2020-10-21T14:22:48.539481Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data\t\t\t\t dvc.lock dvc.yaml\t README.md\t src\r\n", - "dvc-3-automate-experiments.ipynb dvc-venv params.yaml requirements.txt\r\n" - ] - } - ], - "source": [ - "!ls " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:23:25.865051Z", - "start_time": "2020-10-21T14:23:25.749060Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "stages:\r\n", - " data_load:\r\n", - " cmd: python src/data_load.py --config=params.yaml\r\n", - " deps:\r\n", - " - src/data_load.py\r\n", - " params:\r\n", - " - data_load\r\n", - " outs:\r\n", - " - data/classes.json\r\n", - " - data/iris.csv\r\n", - " feature_extraction:\r\n", - " cmd: python src/featurization.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/iris.csv\r\n", - " - src/featurization.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/iris_featurized.csv\r\n", - " split_dataset:\r\n", - " cmd: python src/split_dataset.py 
--config=params.yaml\r\n", - " deps:\r\n", - " - data/iris_featurized.csv\r\n", - " - src/split_dataset.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - featurize\r\n", - " outs:\r\n", - " - data/test.csv\r\n", - " - data/train.csv\r\n", - " train:\r\n", - " cmd: python src/train.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/train.csv\r\n", - " - src/train.py\r\n", - " params:\r\n", - " - data_split\r\n", - " - train\r\n", - " outs:\r\n", - " - data/model.joblib\r\n", - " evaluate:\r\n", - " cmd: python src/evaluate.py --config=params.yaml\r\n", - " deps:\r\n", - " - data/classes.json\r\n", - " - data/model.joblib\r\n", - " - data/test.csv\r\n", - " - src/evaluate.py\r\n", - " params:\r\n", - " - data_load\r\n", - " - data_split\r\n", - " - evaluate\r\n", - " - train\r\n", - " metrics:\r\n", - " - data/metrics.json\r\n", - " plots:\r\n", - " - data/cm.csv\r\n" - ] - } - ], - "source": [ - "!cat dvc.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reproduce DVC pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Running stage 'feature_extraction':\n", - "> python src/featurization.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'train' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Running stage 'evaluate':\n", - "> python src/evaluate.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock\n", - "Use `dvc push` to send your updates to remote storage.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc repro" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:22:58.124155Z", - "start_time": "2020-10-21T14:22:57.646371Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sepal_lengthsepal_widthpetal_lengthpetal_widthtarget
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", - "
" - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width target\n", - "0 5.1 3.5 1.4 0.2 0\n", - "1 4.9 3.0 1.4 0.2 0\n", - "2 4.7 3.2 1.3 0.2 0\n", - "3 4.6 3.1 1.5 0.2 0\n", - "4 5.0 3.6 1.4 0.2 0" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "features = pd.read_csv('data/iris_featurized.csv')\n", - "features.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Commit changes " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T12:58:37.280454Z", - "start_time": "2020-10-21T12:58:37.272728Z" - } - }, - "source": [ - "```bash\n", - "# Check Git status\n", - "\n", - "git status -s\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:30:45.961182Z", - "start_time": "2020-07-03T19:30:45.916816Z" - } - }, - "source": [ - "```bash\n", - "# Commit changes \n", - "\n", - "git add .\n", - "git commit -m \"Complete DVC pipeline\"\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Experimenting with reproducible pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How reproduce experiments?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> The most exciting part of DVC is reproducibility.\n", - ">> Reproducibility is the time you are getting benefits out of DVC instead of spending time defining the ML pipelines.\n", - "\n", - "> DVC tracks all the dependencies, which helps you iterate on ML models faster without thinking what was affected by your last change.\n", - ">> In order to track all the dependencies, DVC finds and reads ALL the DVC-files in a repository and builds a dependency graph (DAG) based on these files.\n", - "\n", - "> This is one of the differences between DVC reproducibility and traditional Makefile-like build automation tools (Make, Maven, Ant, Rakefile etc). It was designed in such a way to localize specification of DAG nodes.\n", - "If you run repro on any created DVC-file from our repository, nothing happens because nothing was changed in the defined pipeline.\n", - "\n", - "(c) dvc.org https://dvc.org/doc/tutorial/reproducibility" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:02.889684Z", - "start_time": "2020-07-03T19:31:00.936546Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Stage 'train' didn't change, skipping\n", - "Stage 'evaluate' didn't change, skipping\n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Nothing to reproduce\n", - "\n", - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 1: Add features\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create new experiment branch\n", - "\n", - "Before editing the code/featurization.py file, please create and 
checkout a new branch __ratio_features__" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:05.089755Z", - "start_time": "2020-07-03T19:31:04.832150Z" - } - }, - "source": [ - "```bash\n", - "# Create new branch\n", - "\n", - "git checkout -b exp1-ratio-features\n", - "git branch\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update featurization.py" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "in file __featurization.py__ in function`get_features()` after line \n", - "\n", - "```python\n", - " features = get_features(dataset)\n", - "```\n", - "\n", - "add lines:\n", - "\n", - "```python\n", - " features['sepal_length_to_sepal_width'] = features['sepal_length'] / features['sepal_width']\n", - " features['petal_length_to_petal_width'] = features['petal_length'] / features['petal_width']\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create experiment" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:26:16.419175Z", - "start_time": "2020-10-21T14:26:13.569917Z" - }, - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'split_dataset' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'train' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'evaluate' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock src/featurization.py\n", - " core\u001b[39m>\n", 
- "Reproduced experiment(s): exp1-ratio-features\n", - "Experiment results have been applied to your workspace.\n", - "\n", - "To promote an experiment to a Git branch run:\n", - "\n", - "\tdvc exp branch \n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp run -n exp1-ratio-features" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:26:17.593890Z", - "start_time": "2020-10-21T14:26:17.580883Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sepal_lengthsepal_widthpetal_lengthpetal_widthtargetsepal_length_to_sepal_widthpetal_length_to_petal_width
05.13.51.40.201.4571437.0
14.93.01.40.201.6333337.0
24.73.21.30.201.4687506.5
34.63.11.50.201.4838717.5
45.03.61.40.201.3888897.0
\n", - "
" - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width target \\\n", - "0 5.1 3.5 1.4 0.2 0 \n", - "1 4.9 3.0 1.4 0.2 0 \n", - "2 4.7 3.2 1.3 0.2 0 \n", - "3 4.6 3.1 1.5 0.2 0 \n", - "4 5.0 3.6 1.4 0.2 0 \n", - "\n", - " sepal_length_to_sepal_width petal_length_to_petal_width \n", - "0 1.457143 7.0 \n", - "1 1.633333 7.0 \n", - "2 1.468750 6.5 \n", - "3 1.483871 7.5 \n", - "4 1.388889 7.0 " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Check features used in this pipeline\n", - "\n", - "import pandas as pd\n", - "\n", - "features = pd.read_csv('data/iris_featurized.csv')\n", - "features.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:26:20.852572Z", - "start_time": "2020-10-21T14:26:20.724853Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "На ветке exp1-ratio-features\r\n", - "Изменения, которые не в индексе для коммита:\r\n", - " (используйте «git add <файл>…», чтобы добавить файл в индекс)\r\n", - " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mизменено: dvc-3-automate-experiments.ipynb\u001b[m\r\n", - "\t\u001b[31mизменено: dvc.lock\u001b[m\r\n", - "\t\u001b[31mизменено: src/featurization.py\u001b[m\r\n", - "\r\n", - "нет изменений добавленных для коммита\r\n", - "(используйте «git add» и/или «git commit -a»)\r\n" - ] - } - ], - "source": [ - "!git status" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:26:28.905617Z", - "start_time": "2020-10-21T14:26:26.333849Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 0.15385 0.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Get difference with metric from 
previous pipeline\n", - "!dvc metrics diff --all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commit the experiment changes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:31:39.838836Z", - "start_time": "2020-07-03T19:31:39.445353Z" - } - }, - "source": [ - "```bash\n", - "# Commit changes\n", - "\n", - "git add .\n", - "git commit -m \"Experiment with new features\"\n", - "git tag -a \"exp1_ratio_features\" -m \"Experiment with new features\"\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 2: Tune Logistic Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a new experiment branch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:32:43.387938Z", - "start_time": "2020-07-03T19:32:43.131917Z" - } - }, - "source": [ - "```bash\n", - "# Create new branch for experiment\n", - "\n", - "git checkout -b exp2-tuning-logreg\n", - "git branch\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:32:52.254763Z", - "start_time": "2020-07-03T19:32:50.225661Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Stage 'train' didn't change, skipping\n", - "Stage 'evaluate' didn't change, skipping\n", - "Data and pipelines are up to date.\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Nothing to reproduce since code was checked out by `git checkout`\n", - "# and data files were checked out by `dvc checkout`\n", - "!dvc repro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tuning parameters\n", - "\n", - 
"in file __train.py__ :\n", - "\n", - "replace LogisticRegression params with:\n", - "\n", - "```python\n", - " clf = LogisticRegression(C=0.01, solver='lbfgs', multi_class='multinomial', max_iter=100)\n", - "```\n", - "__Note__: here we changed logistic regresssion hyperparameters: C to 0.1\n", - "\n", - "\n", - "https://dvc.org/doc/tutorials/get-started/experiments#tuning-parameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create experiment" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-21T14:27:55.449989Z", - "start_time": "2020-10-21T14:27:50.294369Z" - }, - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Stage 'train' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Stage 'evaluate' is cached - skipping run, checking out outputs\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "To track the changes with git, run:\n", - "\n", - "\tgit add dvc.lock src/train.py\n", - " core\u001b[39m>\n", - "Reproduced experiment(s): exp2-tuning-logreg\n", - "Experiment results have been applied to your workspace.\n", - "\n", - "To promote an experiment to a Git branch run:\n", - "\n", - "\tdvc exp branch \n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp run -n exp2-tuning-logreg" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:03:21.593735Z", - "start_time": "2020-10-22T14:03:21.473130Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"f1_score\": 0.9305555555555555}" - ] - } - ], - "source": [ - "# Get difference with metric from previous pipeline\n", 
- "\n", - "!cat data/metrics.json" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:03:48.640425Z", - "start_time": "2020-10-22T14:03:47.009948Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path f1_score core\u001b[39m>\n", - "data/metrics.json 0.93056\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change\n", - "data/metrics.json f1_score 0.93056 0.77671\n", - "\n", - "Path Param Value Change\n", - "params.yaml data_load.classes_names_path data/classes.json diff not supported\n", - "params.yaml data_load.dummy_param dummy_value diff not supported\n", - "params.yaml data_load.raw_data_path data/iris.csv diff not supported\n", - "params.yaml data_split.test_path data/test.csv diff not supported\n", - "params.yaml data_split.test_size 0.2 0\n", - "params.yaml data_split.train_path data/train.csv diff not supported\n", - "params.yaml evaluate.confusion_matrix data/cm.csv diff not supported\n", - "params.yaml evaluate.metrics_file data/metrics.json diff not supported\n", - "params.yaml featurize.features_path data/iris_featurized.csv diff not supported\n", - "params.yaml featurize.target_column target diff not supported\n", - "params.yaml train.model_path data/model.joblib diff not supported\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp diff --all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commit changes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:15.808072Z", - "start_time": "2020-07-03T19:36:15.762972Z" - }, - "scrolled": true - }, - "source": [ - "```bash\n", - "# Commit changes\n", - "\n", - "git add .\n", - "git commit -m \"Tune 
model. LogisticRegression. C=0.1\"\n", - "git tag -a \"exp2_tuning_logreg\" -m \"Tune model. LogisticRegression. C=0.01\"\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment 3: Use SVM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:14:01.831192Z", - "start_time": "2020-07-03T19:14:01.829062Z" - } - }, - "source": [ - "### Create a new experiment branch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:20.443851Z", - "start_time": "2020-07-03T19:36:20.187021Z" - } - }, - "source": [ - "```bash\n", - "# Create a new experiment branch \n", - "\n", - "git checkout -b exp3-svm\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update train.py\n", - "\n", - "in file __train.py__ replace line\n", - "\n", - "```python\n", - " clf = LogisticRegression(C=0.1, solver='newton-cg', multi_class='multinomial', max_iter=100)\n", - "```\n", - "\n", - "with line\n", - "\n", - "```python\n", - " clf = SVC(C=0.01, kernel='linear', gamma='scale', degree=5)\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create experiment" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:35.537208Z", - "start_time": "2020-07-03T19:36:32.544097Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stage 'data_load' didn't change, skipping core\u001b[39m>\n", - "Stage 'feature_extraction' didn't change, skipping\n", - "Stage 'split_dataset' didn't change, skipping\n", - "Running stage 'train':\n", - "> python src/train.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "Running stage 'evaluate':\n", - "> python src/evaluate.py --config=params.yaml\n", - "Updating lock file 'dvc.lock' \n", - "\n", - "To track the 
changes with git, run:\n", - "\n", - "\tgit add dvc.lock src/train.py\n", - " core\u001b[39m>\n", - "Reproduced experiment(s): exp3-svm\n", - "Experiment results have been applied to your workspace.\n", - "\n", - "To promote an experiment to a Git branch run:\n", - "\n", - "\tdvc exp branch \n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp run -n exp3-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:05:25.052314Z", - "start_time": "2020-10-22T14:05:23.390114Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path f1_score core\u001b[39m>\n", - "data/metrics.json 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:05:35.904182Z", - "start_time": "2020-10-22T14:05:35.779856Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "На ветке exp3-svm\r\n", - "Изменения, которые не в индексе для коммита:\r\n", - " (используйте «git add <файл>…», чтобы добавить файл в индекс)\r\n", - " (use \"git restore ...\" to discard changes in working directory)\r\n", - "\t\u001b[31mизменено: dvc.lock\u001b[m\r\n", - "\t\u001b[31mизменено: src/train.py\u001b[m\r\n", - "\r\n", - "нет изменений добавленных для коммита\r\n", - "(используйте «git add» и/или «git commit -a»)\r\n" - ] - } - ], - "source": [ - "!git status" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commit changes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:36:41.766798Z", - "start_time": "2020-07-03T19:36:41.377185Z" - } - }, - "source": [ - "```bash\n", - "# Commit changes\n", - "\n", - "git add .\n", - "git commit -m \"Experiment 3 with SVM estimator\"\n", - "git tag -a \"exp3_svm\" -m \"Experiment 3 with SVM 
estimator\"\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Merge best experiment `dvc-tutorial ` branch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:45:17.537969Z", - "start_time": "2020-07-03T19:45:17.463715Z" - }, - "scrolled": true - }, - "source": [ - "```bash\n", - "# Merge the best experiment\n", - "\n", - "git checkout dvc-tutorial \n", - "git merge exp3_svm\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compare experiment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare params " - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:07:30.189016Z", - "start_time": "2020-10-22T14:07:27.537384Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m core\u001b[39m>" - ] - } - ], - "source": [ - "# Get params diffs \n", - "\n", - "!dvc params diff" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:07:47.322051Z", - "start_time": "2020-10-22T14:07:44.759864Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Param Old Newre\u001b[39m>\n", - "params.yaml data_load.classes_names_path data/classes.json data/classes.json\n", - "params.yaml data_load.dummy_param dummy_value dummy_value\n", - "params.yaml data_load.raw_data_path data/iris.csv data/iris.csv\n", - "params.yaml data_split.test_path data/test.csv data/test.csv\n", - "params.yaml data_split.test_size 0.2 0.2\n", - "params.yaml data_split.train_path data/train.csv data/train.csv\n", - "params.yaml evaluate.confusion_matrix data/cm.csv data/cm.csv\n", - "params.yaml evaluate.metrics_file data/metrics.json data/metrics.json\n", - "params.yaml featurize.features_path data/iris_featurized.csv 
data/iris_featurized.csv\n", - "params.yaml featurize.target_column target target\n", - "params.yaml train.model_path data/model.joblib data/model.joblib\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Compare parameters with a specific commit, a tag or any revision\n", - "\n", - "!dvc params diff --all" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:09:20.304575Z", - "start_time": "2020-07-03T19:09:18.649548Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"params.yaml\": {\"data_split.test_size\": {\"old\": 0.2, \"new\": 0.2, \"diff\": 0.0}, \"featurize.features_path\": {\"old\": \"data/iris_featurized.csv\", \"new\": \"data/iris_featurized.csv\"}, \"data_load.raw_data_path\": {\"old\": \"data/iris.csv\", \"new\": \"data/iris.csv\"}, \"data_load.dummy_param\": {\"old\": \"dummy_value\", \"new\": \"dummy_value\"}, \"featurize.target_column\": {\"old\": \"target\", \"new\": \"target\"}, \"data_load.classes_names_path\": {\"old\": \"data/classes.json\", \"new\": \"data/classes.json\"}, \"train.model_path\": {\"old\": \"data/model.joblib\", \"new\": \"data/model.joblib\"}, \"data_split.test_path\": {\"old\": \"data/test.csv\", \"new\": \"data/test.csv\"}, \"evaluate.metrics_file\": {\"old\": \"data/metrics.json\", \"new\": \"data/metrics.json\"}, \"evaluate.confusion_matrix\": {\"old\": \"data/cm.csv\", \"new\": \"data/cm.csv\"}, \"data_split.train_path\": {\"old\": \"data/train.csv\", \"new\": \"data/train.csv\"}}}\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc params diff --show-json --all" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2020-07-03T19:09:27.495017Z", - "start_time": "2020-07-03T19:09:25.848748Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| Path | Param | Old | New |\n", - 
"|-------------|------------------------------|--------------------------|--------------------------|\n", - "| params.yaml | data_load.classes_names_path | data/classes.json | data/classes.json |\n", - "| params.yaml | data_load.dummy_param | dummy_value | dummy_value |\n", - "| params.yaml | data_load.raw_data_path | data/iris.csv | data/iris.csv |\n", - "| params.yaml | data_split.test_path | data/test.csv | data/test.csv |\n", - "| params.yaml | data_split.test_size | 0.2 | 0.2 |\n", - "| params.yaml | data_split.train_path | data/train.csv | data/train.csv |\n", - "| params.yaml | evaluate.confusion_matrix | data/cm.csv | data/cm.csv |\n", - "| params.yaml | evaluate.metrics_file | data/metrics.json | data/metrics.json |\n", - "| params.yaml | featurize.features_path | data/iris_featurized.csv | data/iris_featurized.csv |\n", - "| params.yaml | featurize.target_column | target | target |\n", - "| params.yaml | train.model_path | data/model.joblib | data/model.joblib |\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc params diff --show-md --all" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:08:29.319419Z", - "start_time": "2020-10-22T14:08:29.189441Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mcommit ecb3887bcb625b372862bd612d6cc4b7392bbd35\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mdvc-tutorial\u001b[m\u001b[33m, \u001b[m\u001b[1;33mtag: exp3_svm\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp3-svm\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 23:28:45 2021 +0900\r\n", - "\r\n", - " Experiment 3 with SVM estimator\r\n", - "\r\n", - "\u001b[33mcommit 707d2606c349c7b9a7fb8fe3fbcd44d632bcc127\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp2_tuning_logreg\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp2-tuning-logreg\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: 
AlexKolosov \r\n", - "Date: Mon Mar 15 23:26:19 2021 +0900\r\n", - "\r\n", - " Tune model. LogisticRegression. C=0.1\r\n", - "\r\n", - "\u001b[33mcommit 4aab30df0f3eba12f1a11e672268912f8bcf2ebb\u001b[m\u001b[33m (\u001b[m\u001b[1;33mtag: exp1_ratio_features\u001b[m\u001b[33m, \u001b[m\u001b[1;32mexp1-ratio-features\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 23:21:59 2021 +0900\r\n", - "\r\n", - " Experiment with new features\r\n", - "\r\n", - "\u001b[33mcommit 78d4c7dcecef33f58ddead3d2066be0774c711f2\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 23:07:18 2021 +0900\r\n", - "\r\n", - " Complete DVC pipeline\r\n", - "\r\n", - "\u001b[33mcommit abe17a966ef4b4fe0609a4d9e7782e2ead7ebee7\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 22:45:51 2021 +0900\r\n", - "\r\n", - " Initialize DVC\r\n", - "\r\n", - "\u001b[33mcommit 875d1963ff4a87bb9d3ae691ad3e80b8b985eeb0\u001b[m\u001b[33m (\u001b[m\u001b[1;32mdvc-2\u001b[m\u001b[33m)\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 22:45:17 2021 +0900\r\n", - "\r\n", - " Read config using python-box\r\n", - "\r\n", - "\u001b[33mcommit 3a35f44c72841ab7e9a02ae0fed85141104e08f9\u001b[m\r\n", - "Author: AlexKolosov \r\n", - "Date: Mon Mar 15 22:44:36 2021 +0900\r\n", - "\r\n", - " Update dependencies\r\n", - "\r\n", - "\u001b[33mcommit 5cdaaca1ac7b67fb1acad0ffee3bb8c60f31a70c\u001b[m\u001b[33m (\u001b[m\u001b[1;31morigin/master\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/HEAD\u001b[m\u001b[33m, \u001b[m\u001b[1;32mmaster\u001b[m\u001b[33m)\u001b[m\r\n", - "Merge: 3b224bb f6f1318\r\n", - "Author: Mikhail \r\n", - "Date: Fri Oct 23 08:33:39 2020 +0000\r\n", - "\r\n", - " Merge branch 'dev' into 'master'\r\n", - " \r\n", - " Release 1.0\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!6\r\n", - "\r\n", - "\u001b[33mcommit f6f1318f12e40af52206d8307491569fb9269333\u001b[m\u001b[33m 
(\u001b[m\u001b[1;31morigin/dev\u001b[m\u001b[33m)\u001b[m\r\n", - "Merge: aa6bd54 3b224bb\r\n", - "Author: Mikhail \r\n", - "Date: Fri Oct 23 08:33:32 2020 +0000\r\n", - "\r\n", - " Merge branch 'master' into 'dev'\r\n", - " \r\n", - " # Conflicts:\r\n", - " # README.md\r\n", - "\r\n", - "\u001b[33mcommit aa6bd541d9212b8b153942f708a205c611081329\u001b[m\r\n", - "Merge: 15bd59f d135235\r\n", - "Author: Mikhail \r\n", - "Date: Fri Oct 23 08:32:29 2020 +0000\r\n", - "\r\n", - " Merge branch 'release-1.0' into 'dev'\r\n", - " \r\n", - " Release 1.0\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!5\r\n", - "\r\n", - "\u001b[33mcommit d1352356bb87bc620815ad32c0ddd2b37ef75b14\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Fri Oct 23 08:32:29 2020 +0000\r\n", - "\r\n", - " Release 1.0\r\n", - "\r\n", - "\u001b[33mcommit 15bd59fe85e1e002d1ea45230dc61f3b9c4dcfe3\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Tue Aug 18 12:45:16 2020 +0900\r\n", - "\r\n", - " Update DVC -> 1.5.0\r\n", - "\r\n", - "\u001b[33mcommit 3573af273082ae1cad1b141131eb824e314eee43\u001b[m\r\n", - "Merge: aeecba0 1102dc2\r\n", - "Author: Mikhail \r\n", - "Date: Sat Jul 4 07:23:25 2020 +0300\r\n", - "\r\n", - " Updated evaluate and params\r\n", - "\r\n", - "\u001b[33mcommit aeecba0880f016303eaa20181d27b60bd3ceb388\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Sat Jul 4 07:16:31 2020 +0300\r\n", - "\r\n", - " Update experimeting and metrics section\r\n", - "\r\n", - "\u001b[33mcommit 1102dc2e3f636b2d37558f95a960c788f3de32ed\u001b[m\r\n", - "Merge: 855c61a 92ac211\r\n", - "Author: Mikhail \r\n", - "Date: Wed Jul 1 07:22:32 2020 +0000\r\n", - "\r\n", - " Merge branch 'update-confusion-matrix' into 'dev'\r\n", - " \r\n", - " update confusion matrix\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!4\r\n", - "\r\n", - "\u001b[33mcommit 92ac211f2139095965d0e26304d2d39003136def\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Tue 
Jun 30 13:08:30 2020 +0900\r\n", - "\r\n", - " update confusion matrix\r\n", - "\r\n", - "\u001b[33mcommit 855c61ac3f02f8938445fe749846e20d01e0f247\u001b[m\r\n", - "Merge: 22aeb23 7fbf4d8\r\n", - "Author: Alexander Kolosov \r\n", - "Date: Mon Jun 29 08:47:37 2020 +0000\r\n", - "\r\n", - " Merge branch 'dev-update-pipelines' into 'dev'\r\n", - " \r\n", - " Dev update pipelines\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!3\r\n", - "\r\n", - "\u001b[33mcommit 7fbf4d8f4e54be947f77dce09191b4f6fbb287f0\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Mon Jun 29 08:47:37 2020 +0000\r\n", - "\r\n", - " Dev update pipelines\r\n", - "\r\n", - "\u001b[33mcommit 22aeb23eb6b54f12f11c76a5714dbf6bff5f11f9\u001b[m\r\n", - "Author: Mikhail \r\n", - "Date: Sun Jun 28 19:02:29 2020 +0300\r\n", - "\r\n", - " Update name of tutorial and notebook\r\n", - "\r\n", - "\u001b[33mcommit 110a584e41fa7c140bbaf8130f70d4112e58d1a4\u001b[m\r\n", - "Merge: 2d7e834 a8d3200\r\n", - "Author: Mikhail \r\n", - "Date: Sat Jun 27 07:49:11 2020 +0000\r\n", - "\r\n", - " Merge branch 'update-software' into 'dev'\r\n", - " \r\n", - " Update software\r\n", - " \r\n", - " See merge request 7labs.ru/tutorials-dvc/dvc-3-automate-experiments!2\r\n", - "\r\n", - "\u001b[33mcommit a8d3200b8cbffdc4af1c7204710d217e9f685928\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Fri Jun 26 17:58:32 2020 +0900\r\n", - "\r\n", - " intall toc for jupyter notebook\r\n", - "\r\n", - "\u001b[33mcommit 8b042ad196928f9584b4bbce058625896af78d9d\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Fri Jun 26 17:58:12 2020 +0900\r\n", - "\r\n", - " upgrade dvc\r\n", - "\r\n", - "\u001b[33mcommit 2d7e834a6d115d1b47253377b3baaace559e3259\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Thu Jun 11 12:53:18 2020 +0900\r\n", - "\r\n", - " add data/ to .gitignore\r\n", - "\r\n", - "\u001b[33mcommit 8817b3ed1f82ed1c4feb9122d49237b37356e70e\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:56:32 2020 
+0900\r\n", - "\r\n", - " update Lesson 4.ipynb: append description of dvc plots diff\r\n", - "\r\n", - "\u001b[33mcommit a8db726c3f368c39180d61d21f21bf6727db20c0\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:44:00 2020 +0900\r\n", - "\r\n", - " update Lesson 4.ipynb: add section for dvc metrics diff and dvc plots\r\n", - "\r\n", - "\u001b[33mcommit 77559e316fe6b5fd0a11f27a06fbc9eed1c2b606\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 22:43:09 2020 +0900\r\n", - "\r\n", - " update src/evaluate.py: put metric and confusion matrix in separated files\r\n", - "\r\n", - "\u001b[33mcommit a0afac2ff2dc7c5815c72ec3770888b67e5f04e7\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 12:05:51 2020 +0900\r\n", - "\r\n", - " refactor code modules\r\n", - "\r\n", - "\u001b[33mcommit 73846297879b1f1be3868c64e73b7d8ad6966b09\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Wed Jun 10 12:04:37 2020 +0900\r\n", - "\r\n", - " fix Lesson 4.ipynb\r\n", - "\r\n", - "\u001b[33mcommit b6ba776f8607c6481e34f8a40af4c23a5cd36990\u001b[m\r\n", - "Author: Alex \r\n", - "Date: Tue Jun 9 19:27:13 2020 +0900\r\n", - "\r\n", - " create repo structure for lesson 4\r\n", - "\r\n", - "\u001b[33mcommit 3b224bb7dad464cfc9165606c468ece07a8b7e8f\u001b[m\r\n", - "Author: Alexander Kolosov \r\n", - "Date: Tue Jun 9 06:13:04 2020 +0000\r\n", - "\r\n", - " Initial commit\r\n" - ] - } - ], - "source": [ - "# To see the difference between two specific commits, both need to be specified:\n", - "\n", - "!git log" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:08:58.251510Z", - "start_time": "2020-10-22T14:08:55.660152Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m core\u001b[39m>" - ] - } - ], - "source": [ - "!dvc params diff 7619688214cc3b9fe3d3b59674c07c12fc134b47 HEAD^" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Show 
metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:09:14.039904Z", - "start_time": "2020-10-22T14:09:12.356063Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path f1_score core\u001b[39m>\n", - "data/metrics.json 1.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# this pipeline metrics \n", - "\n", - "!dvc metrics show" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:09:18.038960Z", - "start_time": "2020-10-22T14:09:16.234043Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Revision Path f1_score core\u001b[39m>\n", - "workspace data/metrics.json 1.0\n", - "dvc-tutorial, exp3-svm, exp3_svm data/metrics.json 1.0\n", - "exp1-ratio-features, exp1_ratio_features data/metrics.json 0.15385\n", - "exp2-tuning-logreg, exp2_tuning_logreg data/metrics.json 0.93056\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# show all commited pipelines metrics (all branch and tags)\n", - "\n", - "!dvc metrics show -a -T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare metrics (get differences)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:09:38.852532Z", - "start_time": "2020-10-22T14:09:36.162510Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m core\u001b[39m>" - ] - } - ], - "source": [ - "!dvc metrics diff" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:09:41.557718Z", - "start_time": "2020-10-22T14:09:38.912543Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json 
f1_score 1.0 1.0 0.0\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# --all - list all metrics, even those without changes\n", - "\n", - "!dvc metrics diff --all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* чтобы сравнить текущую метрики из текущего коммита и из другого, нужно указать другой (old) коммит:" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:09:57.211470Z", - "start_time": "2020-10-22T14:09:54.710817Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 1.0 0.84615\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Compare old and new branches\n", - "\n", - "!dvc metrics diff exp1-ratio-features exp3-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:10:03.187146Z", - "start_time": "2020-10-22T14:10:00.568957Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Old New Change core\u001b[39m>\n", - "data/metrics.json f1_score 0.15385 1.0 0.84615\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Equivalent to `!dvc metrics diff exp1-ratio-features dvc-tutorial`, because dvc-tutorial - current branch\n", - "\n", - "!dvc metrics diff exp1-ratio-features" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:10:38.911976Z", - "start_time": "2020-10-22T14:10:36.304792Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| Path | Metric | Old | New | Change | core\u001b[39m>\n", - "|-------------------|----------|---------|-------|----------|\n", - "| data/metrics.json | f1_score | 0.15385 | 1.0 | 0.84615 |\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc metrics 
diff exp1-ratio-features --show-md" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare experiments" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp diff" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change\n", - "data/metrics.json f1_score 0.93056 0.77671\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp diff exp1-ratio-features exp2-tuning-logreg" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change\n", - "data/metrics.json f1_score 1 0.84615\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp diff exp1-ratio-features exp3-svm" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path Metric Value Change\n", - "data/metrics.json f1_score 1 0.069444\n", - "\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc exp diff exp2-tuning-logreg exp3-svm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build Plots\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:11:04.697127Z", - "start_time": "2020-10-22T14:11:04.694448Z" - } - }, - "outputs": [], - "source": [ - "from IPython.display import IFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Show" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:15:07.599224Z", - "start_time": "2020-10-22T14:15:05.455604Z" - } - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "file:///home/alex/Dev/Projects/tutorials-dvc/dvc-3-automate-experiments/data/plots-show.html\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!dvc plots show --template confusion \"data/cm.csv\" -x actual -y predicted -o data/plots-show.html" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:15:07.664691Z", - "start_time": "2020-10-22T14:15:07.660138Z" - }, - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "IFrame(src='data/plots-show.html', width=800, height=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Diff" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:15:16.499865Z", - "start_time": "2020-10-22T14:15:15.626027Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "file:///Users/mnrozhkov/dev/dvc/dvc-3-automate-experiments/data/plots-diff.html\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# Build metircs plots for all 3 experiments\n", - "!dvc plots diff -t confusion -o data/plots-diff.html exp1-ratio-features exp3-svm -x predicted" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "ExecuteTime": { - "end_time": "2020-10-22T14:15:23.321541Z", - "start_time": "2020-10-22T14:15:23.317724Z" - }, - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "IFrame(src='data/plots-diff.html', width=800, height=500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "230.938px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/params.yaml b/params.yaml index 9bb859c..6d6738a 100644 --- a/params.yaml +++ b/params.yaml @@ -1,8 +1,8 @@ - data_load: raw_data_path: data/iris.csv classes_names_path: data/classes.json + featurize: features_path: data/iris_featurized.csv target_column: target diff --git a/requirements.txt b/requirements.txt index 613946b..79a1c05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,10 @@ -dvc==2.0.5 -joblib==1.0.1 -jupyter==1.0.0 -jupyter_contrib_nbextensions==0.5.1 -ipykernel==5.5.0 -matplotlib==3.3.4 -numpy==1.20.1 -pandas==1.2.3 
-python-box==5.3.0 -pyyaml==5.4.1 -scikit-learn==0.24.1 -scipy==1.6.1 -tqdm==4.59.0 \ No newline at end of file +dvc==2.57.2 +joblib==1.2.0 +matplotlib==3.7.1 +numpy==1.24.3 +pandas==2.0.1 +python-box==7.0.1 +pyyaml==6.0 +scikit-learn==1.2.2 +scipy==1.10.1 +tqdm==4.65.0 \ No newline at end of file From 4099335ca2a77a21f52f6cde30b47e5b73206f69 Mon Sep 17 00:00:00 2001 From: AlexanderKolosov Date: Fri, 19 May 2023 20:23:47 +0900 Subject: [PATCH 8/8] Update structure --- .gitignore | 4 +++- data/.gitignore | 1 + models/.gitignore | 1 + params.yaml | 6 +++--- reports/.gitignore | 1 + 5 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 data/.gitignore create mode 100644 models/.gitignore create mode 100644 reports/.gitignore diff --git a/.gitignore b/.gitignore index eb43ac8..74712a8 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ __pycache__ # Project -data +data/* +models/* +reports/* diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..b722e9e --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 0000000..b722e9e --- /dev/null +++ b/models/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file diff --git a/params.yaml b/params.yaml index 6d6738a..933640b 100644 --- a/params.yaml +++ b/params.yaml @@ -15,9 +15,9 @@ data_split: train: - model_path: data/model.joblib + model_path: models/model.joblib evaluate: - metrics_file: data/metrics.json - confusion_matrix: data/cm.csv + metrics_file: reports/metrics.json + confusion_matrix: reports/cm.csv diff --git a/reports/.gitignore b/reports/.gitignore new file mode 100644 index 0000000..b722e9e --- /dev/null +++ b/reports/.gitignore @@ -0,0 +1 @@ +!.gitignore \ No newline at end of file