{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Copyright 2019 Google Inc. All Rights Reserved.\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "#\n", "# http://www.apache.org/licenses/LICENSE-2.0\n", "#\n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Install Pipeline SDK - This only needs to be ran once in the enviroment. \n", "# you can find the latest package @ https://github.com/kubeflow/pipelines/releases\n", "#KFP_PACKAGE = 'https://storage.googleapis.com/ml-pipeline/release/0.1.20/kfp.tar.gz'\n", "#!pip3 install $KFP_PACKAGE --upgrade" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import kfp\n", "import kfp.dsl as dsl\n", "from kfp.gcp import use_gcp_secret\n", "from kubernetes import client as k8s_client\n", "from kfp import compiler\n", "from kfp import notebook\n", "from kfp import components as comp" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "PROJECT_NAME='chavoshi-dev-2'\n", "DLMODEL_IMAGE = 'tensorflow/tensorflow:1.12.0-py3'\n", "#DLMODEL_IMAGE = 'tensorflow/tensorflow:1.12.0-py3-gpu'\n", "GCLOUD_SDK = 'google/cloud-sdk:latest'\n", "NOOP_IMAGE = 'ubuntu:16.04'\n", "EXPERIMENT_NAME = 'Image_classification'\n", "LOCAL_PATH = '/mnt/vol'\n", "IMAGE_FOLDER = 'small_bolt_images'\n", "IMAGE_SOURCE = 'gs://cisco-live-2019-demo/' + IMAGE_FOLDER\n", "OUTPUT_DIR = 'gs://chavoshi-dev-mlpipeline/%s' % EXPERIMENT_NAME # Such as gs://bucket/objact/path\n", "BASE_IMAGE='gcr.io/%s/pusherbase:latest' % PROJECT_NAME\n", "TARGET_IMAGE='gcr.io/%s/pusher:latest' % PROJECT_NAME" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from kfp.onprem import mount_pvc" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#def copy_source_data_op(step_name='load-data'):\n", "# return dsl.ContainerOp(\n", "# name = step_name,\n", "# image = GCLOUD_SDK,\n", "# command=['sh', '-c'],\n", "# arguments = [ 'rm -rf ' +LOCAL_PATH+ '/* ' + '&& gsutil -m cp -r -n '+IMAGE_SOURCE+' '+LOCAL_PATH +' && rm -rf /mnt/vol/saved_model && rm -rf /mnt/vol/retrain_logs && gcloud auth activate-service-account --key-file /secret/gcp-credentials/user-gcp-sa.json && gsutil -m rm -r gs://test-gtc-demo-2019/retrain_logs/* 2> /dev/null || true']\n", "# ).add_volume(k8s_client.V1Volume(name='workdir', persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", "# ).add_volume_mount(k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')).apply(use_gcp_secret('user-gcp-sa'))\n", "def copy_source_data_op(step_name='load-data'):\n", " return dsl.ContainerOp(\n", " name = step_name,\n", " image = GCLOUD_SDK,\n", " command=['sh', '-c'],\n", " arguments = [ 'rm -rf ' +LOCAL_PATH+ '/* ' + '&& gsutil -m cp -r -n '+IMAGE_SOURCE+' '+LOCAL_PATH +' && rm -rf /mnt/vol/saved_model && rm -rf /mnt/vol/retrain_logs && gcloud auth activate-service-account --key-file /secret/gcp-credentials/user-gcp-sa.json && gsutil -m rm -r gs://test-gtc-demo-2019/retrain_logs/* 2> /dev/null || true']\n", " ).apply(mount_pvc(pvc_name='pvccc', volume_mount_path=LOCAL_PATH)).apply(use_gcp_secret('user-gcp-sa'))\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gs://chavoshi-dev-mlpipeline/Image_classification gcr.io/chavoshi-dev-2/pusherbase:latest\n" ] } ], "source": [ "print(OUTPUT_DIR, BASE_IMAGE)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2019-05-14 12:24:47:INFO:Checking path: gs://chavoshi-dev-mlpipeline/Image_classification...\n", "2019-05-14 12:24:47:INFO:Generate build files.\n", "2019-05-14 12:24:47:INFO:Start a kaniko job for build.\n", "2019-05-14 12:24:47:INFO:Cannot Find local kubernetes config. Trying in-cluster config.\n", "2019-05-14 12:24:47:INFO:Initialized with in-cluster config.\n", "2019-05-14 12:24:52:INFO:5 seconds: waiting for job to complete\n", "2019-05-14 12:24:57:INFO:10 seconds: waiting for job to complete\n", "2019-05-14 12:25:02:INFO:15 seconds: waiting for job to complete\n", "2019-05-14 12:25:07:INFO:20 seconds: waiting for job to complete\n", "2019-05-14 12:25:12:INFO:25 seconds: waiting for job to complete\n", "2019-05-14 12:25:17:INFO:30 seconds: waiting for job to complete\n", "2019-05-14 12:25:22:INFO:35 seconds: waiting for job to complete\n", "2019-05-14 12:25:27:INFO:40 seconds: waiting for job to complete\n", "2019-05-14 12:25:32:INFO:45 seconds: waiting for job to complete\n", "2019-05-14 12:25:37:INFO:50 seconds: waiting for job to complete\n", "2019-05-14 12:25:42:INFO:55 seconds: waiting for job to complete\n", "2019-05-14 12:25:47:INFO:60 seconds: waiting for job to complete\n", "2019-05-14 12:25:47:INFO:Kaniko job complete.\n", "2019-05-14 12:25:48:INFO:Build image complete.\n" ] } ], "source": [ "%%docker {BASE_IMAGE} {OUTPUT_DIR}\n", "FROM tensorflow/tensorflow:1.12.0-py3\n", "RUN pip3 install tensorflow_hub &&\\\n", " curl -O https://raw.githubusercontent.com/tensorflow/hub/master/examples/image_retraining/retrain.py\n", "ENTRYPOINT [\"python\", \"retrain.py\"]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/mnt/vol small_bolt_images\n" ] } ], "source": [ "print(LOCAL_PATH, IMAGE_FOLDER)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# this first step is to create buttlenecks, by setting training steps to zero \n", "# this is done separately as in the following steps we may choose to run multiple \n", "# steps with various hyper parameters. \n", "\n", "def pre_process_op(step_name='preprocess-data'):\n", " return dsl.ContainerOp(\n", " name = step_name,\n", " image = BASE_IMAGE,\n", " arguments = [\n", " '--image_dir', LOCAL_PATH+'/'+IMAGE_FOLDER,\n", " '--output_labels', LOCAL_PATH+'/output_labels.txt',\n", " '--summaries_dir', LOCAL_PATH+'/retrain_logs',\n", " '--how_many_training_steps', 0,\n", " '--learning_rate', 0.01,\n", " '--bottleneck_dir', LOCAL_PATH+'/bottleneck',\n", " '--tfhub_module', 'https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/classification/2',\n", " #'--saved_model_dir', LOCAL_PATH+'/saved_model',\n", " ]\n", " ).add_volume(k8s_client.V1Volume(name='workdir', persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", " ).add_volume_mount(k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')\n", " ).apply(use_gcp_secret('user-gcp-sa')\n", " #).set_gpu_limit('1')\n", " ).set_cpu_request('2')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# multiple instace of training can run in paralel with various hyper parameters ex learning rate \n", "# however the same tfhub module should be used as in buttle neck creation step\n", "def train_op(step_name='train'):\n", " return dsl.ContainerOp(\n", " name = step_name,\n", " image = BASE_IMAGE,\n", " arguments = [\n", " '--image_dir', LOCAL_PATH+'/'+IMAGE_FOLDER,\n", " '--output_labels', LOCAL_PATH+'/output_labels.txt',\n", " '--summaries_dir', LOCAL_PATH+'/retrain_logs',\n", " '--how_many_training_steps', 10,\n", " '--learning_rate', 0.01,\n", " '--bottleneck_dir', LOCAL_PATH+'/bottleneck',\n", " '--tfhub_module', 'https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/classification/2',\n", " '--saved_model_dir', LOCAL_PATH+'/saved_model',\n", " #'--saved_model_dir', OUTPUT_DIR+'/BOLT/saved_model',\n", "\n", " ]\n", " ).add_volume(k8s_client.V1Volume(name='workdir', persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", " ).add_volume_mount(k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')\n", " ).apply(use_gcp_secret('user-gcp-sa')\n", " ).set_gpu_limit('1')\n", " #).set_cpu_request('2')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def tensorboard_op2(step_name='tensorboard'):\n", " return dsl.ContainerOp(\n", " name = step_name,\n", " image = GCLOUD_SDK,\n", " command=['sh', '-c'],\n", " arguments = ['''echo '{\"outputs\": [{\"source\": \"gs://test-gtc-demo-2019/retrain_logs\", \n", " \"type\": \"tensorboard\"}]}'>/mlpipeline-ui-metadata.json && gcloud auth activate-service-account --key-file '/secret/gcp-credentials/user-gcp-sa.json' && gsutil -m cp -R mnt/vol/retrain_logs gs://test-gtc-demo-2019 ''']\n", " ).add_volume(\n", " k8s_client.V1Volume(\n", " name='workdir', \n", " persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", " ).add_volume_mount(\n", " k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')\n", " ).apply(use_gcp_secret('user-gcp-sa'))\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def tensorboard_func():\n", " from tensorflow.python.lib.io import file_io\n", " import json\n", " \n", " # Exports a sample tensorboard:\n", " metadata = {\n", " 'outputs' : [{\n", " 'type': 'tensorboard',\n", " 'source': 'gs://test-gtc-demo-2019/retrain_logs',\n", " }]\n", " }\n", " \n", " with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:\n", " json.dump(metadata, f) \n", "\n", " import os\n", " \n", " #TODO: copy training files\n", " \n", "tensorboard_op = comp.func_to_container_op(tensorboard_func, base_image='tensorflow/tensorflow:1.12.0-py3')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# this step simply changes the permissions on the drive to make accessible jupyter hub and other locations\n", "def tflite_transform_op():\n", " import tensorflow as tf\n", "\n", " converter = tf.lite.TFLiteConverter.from_saved_model('mnt/vol/retrain')\n", " tflite_model = converter.convert()\n", " open(\"converted_model.tflite\", \"wb\").write(tflite_model)\n", " return \n", " \n", "tensorboard_op = comp.func_to_container_op(tensorboard_func, base_image='tensorflow/tensorflow:1.12.0-py3')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# this step simply changes the permissions on the drive to make accessible jupyter hub and other locations\n", "def publish_op(step_name='publish content'):\n", " return dsl.ContainerOp(\n", " name = step_name,\n", " image = GCLOUD_SDK,\n", " command=['sh', '-c'],\n", " arguments = ['chmod -R 0777 /mnt/vol/ ']\n", " ).add_volume(k8s_client.V1Volume(name='workdir', persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", " ).add_volume_mount(k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')).apply(use_gcp_secret('user-gcp-sa'))\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='TFHub Image Classifier',\n", " description='Users TFHub based models such as Mobilenetv2 and NasNet to train an image classifer.'\n", ")\n", "def tfhub_image_classifier_dag(\n", " model_version: dsl.PipelineParam = dsl.PipelineParam(name='model-version', value='1'),\n", "):\n", " #copy source data\n", " copy_source_data = copy_source_data_op()\n", " \n", " \n", " pre_process_data = pre_process_op()\n", " \n", " pre_process_data.after(copy_source_data)\n", " \n", " \n", " train = train_op()\n", " \n", " train.after(pre_process_data) \n", " tensorboard = tensorboard_op2()\n", " tensorboard.after(train)\n", " \n", " tflite = tflite_transform_op().add_volume(k8s_client.V1Volume(name='workdir', persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(claim_name='nfs'))\n", " ).add_volume_mount(k8s_client.V1VolumeMount(mount_path=LOCAL_PATH, name='workdir')\n", "\n", " tflite.after(train)\n", " \n", " \n", " publish = publish_op()\n", " publish.after(train)\n", " \n", " \n", " #deploy = deploy_op()\n", " #deploy.after(publish)\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()\n", "exp = client.list_experiments().experiments[0]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "from kfp import compiler\n", "compiler.Compiler().compile(tfhub_image_classifier_dag, 'tfhub_image_classifier_dag.tar.gz')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Run link <a href=\"/pipeline/#/runs/details/b9b5cd37-4c18-11e9-8554-42010a8a01f3\" target=\"_blank\" >here</a>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "run = client.run_pipeline(exp.id, 'TF Hub Image Classifier', 'tfhub_image_classifier_dag.tar.gz',\n", " params={})" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }