From 513fabcc3c611be1abbf4b47dde093e3f0c6c9b6 Mon Sep 17 00:00:00 2001
From: Shira Guskin <30695324+shira-g@users.noreply.github.com>
Date: Tue, 24 Dec 2024 22:31:58 -0800
Subject: [PATCH] Added batch evaluation for speculative decoding notebook (#2609)

Co-authored-by: Ekaterina Aidova
---
 .../speculative-sampling.ipynb | 334 +++++++++++++++---
 1 file changed, 276 insertions(+), 58 deletions(-)

diff --git a/notebooks/speculative-sampling/speculative-sampling.ipynb b/notebooks/speculative-sampling/speculative-sampling.ipynb
index 58d4178ac6b..403be4b6aa0 100644
--- a/notebooks/speculative-sampling/speculative-sampling.ipynb
+++ b/notebooks/speculative-sampling/speculative-sampling.ipynb
@@ -70,14 +70,12 @@
 },
 {
 "cell_type": "code",
- "execution_count": 1,
- "id": "4421fc85-bed6-4a62-b8fa-19c7ba474891",
- "metadata": {
- "tags": []
- },
+ "execution_count": null,
+ "id": "dfd782ed",
+ "metadata": {},
 "outputs": [],
 "source": [
- "%pip install -Uq \"openvino>=2024.5.0\" \"openvino-tokenizers>=2024.5.0\" \"openvino-genai>=2024.5.0\" huggingface_hub"
+ "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
 ]
 },
 {
@@ -97,7 +95,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
 "id": "74bb9f96",
 "metadata": {},
 "outputs": [],
@@ -133,7 +131,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
 "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
 "metadata": {
 "tags": []
 },
@@ -142,15 +140,15 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "e83ffbfc2136400194e2b1da63bccb26",
+ "model_id": "37ad0b345de94225892c9d47519a9164",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
- "Dropdown(description='Device:', options=('CPU',), value='CPU')"
+ "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
 ]
 },
- "execution_count": 3,
+ "execution_count": 2,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -191,15 +189,7 @@
 "execution_count": null,
 "id": "553148f5",
 "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " it is made of gas. The gas is heated to a high temperature and then cooled. The gas is yellow because it has a band of light called the \"Bondeson Pendulum Effect.\" The Bondeson Pendulum Effect is caused by the light waves bouncing off of the gas molecules. The light waves bounce off of the gas molecules in different ways, some of the light waves get scattered, and some of the light waves get reflected. The light waves that get scattered and reflected combine to"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "import openvino_genai as ov_genai\n",
 "import time\n",
@@ -243,15 +233,7 @@
 "execution_count": null,
 "id": "c40d9901-ceb2-4c4c-a686-303590292ab3",
 "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Generation time: 18.44s\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "import gc\n",
@@ -282,15 +264,7 @@
 "execution_count": null,
 "id": "9fde1b3c",
 "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " it is made of gas. The gas is heated to a high temperature and then cooled. The gas changes from a hot gas to a cold gas and then from a cold gas to a hot gas. The gas is very hot when it changes from a hot gas to a cold gas and very cold when it changes from a cold gas to a hot gas. When the gas changes from a hot gas to a cold gas it becomes yellow. When the gas changes from a cold gas to a hot gas it"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "scheduler_config = ov_genai.SchedulerConfig()\n",
 "# cache params\n",
@@ -312,72 +286,316 @@
 },
 {
 "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
 "id": "d9739752-0bd8-4be7-a4cc-c076228bfc91",
 "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Generation time: {end_time - start_time:.2f}s\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "9061f205-862a-450e-a102-4d3ea162f588",
+ "metadata": {},
+ "source": [
+ "An alternative approach, Dynamic Speculative Decoding, described in this [paper](https://arxiv.org/abs/2405.04304), is based on heuristics: it adjusts the number of candidate tokens for the next iteration based on the acceptance rate of the current iteration. If all speculative tokens are correct, the number of candidate tokens increases; otherwise, it decreases. To adjust the number of candidate tokens, the `assistant_confidence_threshold` parameter should be used. If the assistant model's confidence in its prediction for the current token is lower than this threshold, the assistant model stops generating candidates for the current iteration, even if `num_assistant_tokens` has not yet been reached. You can find more details in this [blog post](https://huggingface.co/blog/dynamic_speculation_lookahead). This approach is advantageous when the optimal number of candidate tokens for the draft model is unknown or the draft model has a low acceptance rate.\n",
+ "\n",
+ ">*Note*: For small and fast draft models like FastDraft, you may not see a benefit from dynamic speculative decoding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f9c011ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config = ov_genai.GenerationConfig()\n",
+ "config.max_new_tokens = 100\n",
+ "config.assistant_confidence_threshold = 0.05\n",
+ "start_time = time.perf_counter()\n",
+ "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
+ "end_time = time.perf_counter()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b5803b7c-b38b-474d-9604-363e3813b6b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Generation time: {end_time - start_time:.2f}s\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd59ed90",
+ "metadata": {},
+ "source": [
+ "## Evaluate Speculative Decoding on multiple examples"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88772975-bce3-49c8-bae7-28cd3d1d44e1",
+ "metadata": {},
+ "source": [
+ "Configure the data type and the number of examples to run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b0f65ad3139a477282c002eafe409d94",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "num_samples_to_select = 50\n",
+ "\n",
+ "import ipywidgets as widgets\n",
+ "\n",
+ "data_options = [\"Code\", \"Text\"]\n",
+ "data_type = widgets.Dropdown(\n",
+ " options=data_options,\n",
+ " value=data_options[0],\n",
+ " description=\"Data type:\",\n",
+ " disabled=False,\n",
+ ")\n",
+ "data_type"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f3486cd",
+ "metadata": {},
+ "source": [
+ "Load the dataset and prepare the prompts:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "13f03634",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "loading dataset...\n",
+ "Done\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "print(\"loading dataset...\")\n",
+ "\n",
+ "if data_type.value == \"Code\":\n",
+ " ds = load_dataset(\"openai_humaneval\", split=\"test\")\n",
+ " prompts = ds[\"prompt\"]\n",
+ " prompts = [\"\" + prompts[i] for i in range(num_samples_to_select)]\n",
+ "else:\n",
+ " ds = load_dataset(\"abisee/cnn_dailymail\", \"3.0.0\", split=\"test\")\n",
+ " prompts = ds[\"article\"]\n",
+ " prompts = [\n",
+ " \"<|user|> ###\\nArticle: \" + prompts[i] + \"\\n\\nSummarize the above article in 5 sentences.\\n<|end|><|assistant|>\" for i in range(num_samples_to_select)\n",
+ " ]\n",
+ "print(\"Done\")"
+ ]
+ },
 {
- "attachments": {},
 "cell_type": "markdown",
- "id": "9061f205-862a-450e-a102-4d3ea162f588",
+ "id": "be4e20d6",
 "metadata": {},
 "source": [
+ "Run auto-regressive generation and get total runtime per example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1f4ea9e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running Auto-Regressive generation...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "9"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import openvino_genai as ov_genai\n",
+ "import time\n",
+ "from tqdm import tqdm\n",
- "Alternative approach, Dynamic Speculative Decoding, described in the [paper](https://arxiv.org/abs/2405.04304) is based on heuristics and adjusts the number of candidate tokens for the next iteration based on the acceptance rate of the current iteration. If all speculative tokens are correct, the number of candidate tokens increases; otherwise, it decreases. For adjusting number of tokens `assistant_confidence_threshold` parameters should be used. If the assistant model's confidence in its prediction for the current token is lower than this threshold, the assistant model stops the current token generation iteration, even if the number of `num_assistant_tokens` is not yet reached. You can find more details in this [blog post](https://huggingface.co/blog/dynamic_speculation_lookahead). This approach has advantages for cases, when optimal number of tokens for draft model is unknown and draft model has low acceptance rate.\n",
 "\n",
- ">*Note*: For small and fast draft models like FastDraft, you may not see benefit for dynamic speculative decoding."
+ "print(\"Running Auto-Regressive generation...\")\n", + "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n", + "\n", + "config = ov_genai.GenerationConfig()\n", + "config.max_new_tokens = 330\n", + "\n", + "times_auto_regressive = []\n", + "for prompt in tqdm(prompts):\n", + " start_time = time.perf_counter()\n", + " result = pipe.generate(prompt, config)\n", + " end_time = time.perf_counter()\n", + " times_auto_regressive.append(end_time - start_time)\n", + "print(\"Done\")\n", + "\n", + "import gc\n", + "\n", + "del pipe\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "35dbba92", + "metadata": {}, + "source": [ + "Now run generation with speculative-decoding:" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "f9c011ec", + "execution_count": 7, + "id": "d73e9f37", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " it is made of gas. The gas is heated to a high temperature and then cooled. The gas changes from a hot gas to a cold gas and then from a cold gas to a hot gas. The gas is very hot when it changes from a hot gas to a cold gas and very cold when it changes from a cold gas to a hot gas. The gas is very light and can float in the air. When the gas cools it becomes a liquid. The Sun is a huge sphere of" + "Running Speculative Decoding generation...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" ] } ], "source": [ + "scheduler_config = ov_genai.SchedulerConfig()\n", + "# cache params\n", + "scheduler_config.cache_size = 0\n", + "scheduler_config.num_kv_blocks = 2048 // 8\n", + "scheduler_config.max_num_batched_tokens = 2048\n", + "\n", + "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n", + "\n", + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", + "\n", "config = ov_genai.GenerationConfig()\n", - "config.max_new_tokens = 100\n", - "config.assistant_confidence_threshold = 0.05\n", - "start_time = time.perf_counter()\n", - "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n", - "end_time = time.perf_counter()" + "config.max_new_tokens = 330\n", + "config.num_assistant_tokens = 5\n", + "\n", + "\n", + "times_speculative_decoding = []\n", + "print(\"Running Speculative Decoding generation...\")\n", + "for prompt in tqdm(prompts):\n", + " start_time = time.perf_counter()\n", + " result = pipe.generate(prompt, config)\n", + " end_time = time.perf_counter()\n", + " times_speculative_decoding.append((end_time - start_time))\n", + "print(\"Done\")" + ] + }, + { + "cell_type": "markdown", + "id": "a0f4da9c", + "metadata": {}, + "source": [ + "Now let's calculate the speedup:" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "b5803b7c-b38b-474d-9604-363e3813b6b3", + "execution_count": 8, + "id": "ad898772", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Generation time: 17.97s\n" + "average speedup: 2.23\n" ] } ], "source": [ - "print(f\"Generation time: {end_time - start_time:.2f}s\")" + "avg_speedup = sum([x / y for x, y in zip(times_auto_regressive, times_speculative_decoding)]) / len(prompts)\n", + "print(f\"average speedup: 
{avg_speedup:.2f}\")" ] } ], @@ -397,7 +615,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.7" }, "openvino_notebooks": { "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
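
A note on the aggregation in the new `ad898772` cell: the PR reports the arithmetic mean of the per-prompt speedup ratios. The sketch below restates that computation and contrasts it with two other common aggregates; the sample timing values and the `total_ratio`/`geo_mean` names are illustrative additions, not part of this patch.

```python
import math

# Per-prompt wall-clock times in seconds (stand-in values; the notebook
# fills these lists by timing pipe.generate() over 50 prompts).
times_auto_regressive = [5.1, 5.4, 5.3]
times_speculative_decoding = [2.2, 2.4, 2.3]

# Mean of per-prompt ratios -- the aggregate the notebook reports.
per_prompt = [x / y for x, y in zip(times_auto_regressive, times_speculative_decoding)]
avg_speedup = sum(per_prompt) / len(per_prompt)

# Two alternative aggregates, for comparison: the ratio of total runtimes
# weights long prompts more heavily; the geometric mean damps outliers.
total_ratio = sum(times_auto_regressive) / sum(times_speculative_decoding)
geo_mean = math.exp(sum(math.log(r) for r in per_prompt) / len(per_prompt))

print(f"average speedup: {avg_speedup:.2f}")
print(f"total-time ratio: {total_ratio:.2f}, geometric mean: {geo_mean:.2f}")
```

The aggregates can diverge when per-prompt times vary: the runs recorded in the notebook total roughly 4:26 versus 1:52 (a total-time ratio of about 2.37), while the reported mean of per-prompt ratios is 2.23, because short prompts weigh as much as long ones in the mean.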