neuralmagic · dsikka · Nov 10, 2023 · Nov 6, 2023 · Nov 7, 2023 · Nov 8, 2023
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
@@ -158,4 +158,5 @@ def next(
 
     @staticmethod
     def validate(ops) -> bool:
+        # TODO: not implemented for the GraphRouter yet; returns None, not a bool
         pass
diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py
@@ -24,7 +24,7 @@
 from deepsparse.v2.operators import Operator
 
 
-__all__ = ["KVCacheCreator"]
+__all__ = ["KVCacheCreator", "KVCacheCreatorInput"]
 
 
 class KVCacheCreatorOutput(BaseModel):

diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py
@@ -29,7 +29,7 @@
 )
 
 
-__all__ = ["NLEngineOperator"]
+__all__ = ["NLEngineOperator", "NlEngineInput"]
 
 
 class NlEngineInput(BaseModel):

diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py
@@ -11,11 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 from typing import Any
 
 import numpy
 
 from deepsparse.transformers.pipelines.text_generation import FinishReason
+from deepsparse.transformers.utils.helpers import set_generated_length
 from deepsparse.v2.operators import Operator
 from deepsparse.v2.text_generation import TokenGeneratorOperator
 from deepsparse.v2.utils import InferenceState
@@ -31,9 +33,9 @@ def __init__(
         prompt_sequence_length: int,
         sequence_length: int,
     ):
-        self.prompt_sequence_length = prompt_sequence_length
         self.sequence_length = sequence_length
         self.token_generator_creator = token_generator
+        self.prompt_sequence_length = prompt_sequence_length
 
     def can_operate(self, inp: Any):
         kv_cache = inp.get("kv_cache")
@@ -47,49 +49,6 @@ def can_operate(self, inp: Any):
             return True
         return False
 
-    @staticmethod
-    def set_generated_length(
-        max_length: int,
-        prompt_tokens_length: int,
-        sequence_length: int,
-        prompt_sequence_length: int,
-        max_new_tokens: int,
-        finish_reason_choices: "FinishReason",  # noqa
-    ):
-        """
-        Determine the length of the generated tokens. The hard cap on the total number
-        of tokens is based on the sequence length. If max_length is provided and is less
-        than the sequence length, it will be used to cap the total number of tokens
-        generated. If it is not provided, the max_new_tokens attribute will be used and
-        also capped by the sequence length.
-
-        :param max_length: max_length attribute, provided as input during inference
-        :param prompt_tokens_length: the number of prompt tokens used as part of the
-            generated output
-        :param sequence_length: the sequence length used for the pipeline
-        :param prompt_sequence_length: the prompt sequence length used for the pipeline
-        :param max_new_tokens: the max_new_tokens attribute, which may be provided
-        as part of the input during inference
-        """
-        if max_length:
-            # if max_length provided, use that to cap total tokens generated
-            max_tokens = max_length
-            finish_reason = finish_reason_choices.LENGTH
-        else:
-            # if not provided, max tokens is based on max_new_tokens + prompt tokens
-            max_tokens = (
-                min(max_new_tokens, sequence_length - prompt_sequence_length)
-                + prompt_tokens_length
-            )
-            finish_reason = finish_reason_choices.MAX_NEW_TOKENS
-
-        # hard model/pipeline cap
-        return (
-            (sequence_length, finish_reason_choices.CAPACITY)
-            if sequence_length < max_tokens
-            else (max_tokens, finish_reason)
-        )
-
     def run(
         self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs
     ):
@@ -107,13 +66,13 @@ def run(
             logits_shape=prompt_logits[0, -1, :].shape,
             deterministic=not generation_config.do_sample,
             sampling_temperature=generation_config.temperature,
-            tokens=tokens,
+            tokens=copy.copy(tokens),
             **inference_state.current_state,
         )
         token_generator = token_generator_creator_output.get("token_generator")
         token_generator.generate(prompt_logits[0, -1, :])
 
-        max_tokens, length_finish_reason = PrepareGeneration.set_generated_length(
+        max_tokens, length_finish_reason = set_generated_length(
             max_length=generation_config.max_length,
             prompt_tokens_length=1,
             max_new_tokens=generation_config.max_new_tokens,
@@ -131,7 +90,6 @@ def run(
             "finished_reason": [],
             "token_generator": token_generator,
         }
-
         output = {
             "tokens": token_generator.tokens,
             "kv_cache": kv_cache,

diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py
@@ -26,6 +26,9 @@
 from deepsparse.v2.operators import Operator
 
 
+__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"]
+
+
 class GenerationDefaults:
     num_return_sequences = 1
     max_length = 100
@@ -38,9 +41,6 @@ class GenerationDefaults:
     temperature = 1.0
 
 
-__all__ = ["ProcessInputsTextGeneration"]
-
-
 class ProcessInputsTextGeneration(Operator):
     """
     Input processing operator. Responsible for tokenizing the input, handling the
@@ -54,10 +54,10 @@ class ProcessInputsTextGeneration(Operator):
     def __init__(
         self,
         tokenizer: transformers.PreTrainedTokenizerBase,
+        sequence_length: int,
         generation_config: Union[
             str, pathlib.Path, Dict, transformers.GenerationConfig
-        ],
-        sequence_length: int,
+        ] = None,
     ):
         self.generation_config = generation_config
         self.tokenizer = tokenizer

diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import numpy
+from transformers import AutoTokenizer
+
+import pytest
+from deepsparse.transformers.helpers import get_deployment_path
+from deepsparse.transformers.pipelines.text_generation import TextGenerationInput
+from deepsparse.transformers.utils import DecoderKVCache
+from deepsparse.transformers.utils.helpers import initialize_kv_cache_state
+from deepsparse.v2 import InferenceState, PipelineState
+from deepsparse.v2.text_generation import (
+    GenerationDefaults,
+    NLEngineOperator,
+    TokenGeneratorOperator,
+)
+
+
+@pytest.fixture(scope="module")
+def text_generation_attributes():
+    sequence_length = 5
+    prompt_sequence_length = 1
+    return sequence_length, prompt_sequence_length
+
+
+@pytest.fixture(scope="module")
+def model_attributes(text_generation_attributes):
+    model_path = "hf:mgoin/TinyStories-1M-deepsparse"
+    sequence_length, _ = text_generation_attributes
+    deployment_path, model_path = get_deployment_path(model_path)
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        deployment_path,
+        trust_remote_code=False,
+        model_max_length=sequence_length,
+    )
+
+    tokenizer.padding_side = "left"
+    if not tokenizer.pad_token:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    return tokenizer, model_path
+
+
+@pytest.fixture(scope="module")
+def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes):
+    seq_length, _ = text_generation_attributes
+    _, model_path = model_attributes
+
+    nl_engine_operator = NLEngineOperator(
+        sequence_length=seq_length, input_ids_length=1, model_path=model_path
+    )
+    return nl_engine_operator
+
+
+@pytest.fixture(scope="module")
+def pipeline_state(single_token_engine_no_internal_cache):
+    pipeline_state = PipelineState()
+    pipeline_state_vals = {}
+    pipeline_state_vals[
+        "onnx_input_names_no_cache"
+    ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache
+    pipeline_state_vals[
+        "cache_shape"
+    ] = single_token_engine_no_internal_cache.cache_shape
+    pipeline_state_vals[
+        "output_names"
+    ] = single_token_engine_no_internal_cache.output_names
+    pipeline_state_vals[
+        "kv_cache_data_type"
+    ] = single_token_engine_no_internal_cache.kv_cache_data_type
+    pipeline_state.create_state(pipeline_state_vals)
+    return pipeline_state
+
+
+@pytest.fixture(scope="module")
+def large_prompt():
+    prompt = "Hello, how are you doing today?"
+    generation_config = {"top_p": 0, "top_k": 0, "max_length": 10}
+    return TextGenerationInput(prompt=prompt, generation_config=generation_config)
+
+
+@pytest.fixture(scope="module")
+def small_prompt():
+    prompt = "Hello"
+    return TextGenerationInput(prompt=prompt)
+
+
+@pytest.fixture(scope="module")
+def mock_kv_cache():
+    kv_cache = DecoderKVCache()
+    kv_cache.setup(
+        state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])},
+    )
+    return kv_cache
+
+
+@pytest.fixture(scope="module")
+def mock_kv_cache_three_tokens_processed():
+    kv_cache = DecoderKVCache()
+    kv_cache.setup(
+        state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])},
+        num_processed_tokens=3,
+    )
+    return kv_cache
+
+
+@pytest.fixture(scope="module")
+def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes):
+    seq_len, prompt_seq_len = text_generation_attributes
+    kv_cache = DecoderKVCache()
+    kv_cache_state = initialize_kv_cache_state(
+        cache_shape=pipeline_state.current_state.get("cache_shape"),
+        kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"),
+        output_names=pipeline_state.current_state.get("output_names"),
+        length=seq_len - prompt_seq_len,
+        empty=False,
+    )
+    kv_cache.setup(state=kv_cache_state)
+    return kv_cache
+
+
+@pytest.fixture(scope="module")
+def mock_tokens():
+    return [15496]
+
+
+@pytest.fixture(scope="module")
+def mock_tokens_multiple():
+    return [15496, 15496, 15496]
+
+
+@pytest.fixture(scope="module")
+def mock_inference_state():
+    generation_config = GenerationDefaults()
+    inference_state = InferenceState()
+    inference_state.create_state({})
+    inference_state.update_state({"generation_config": generation_config})
+    return inference_state
+
+
+@pytest.fixture(scope="module")
+def mock_token_generator(model_attributes, mock_tokens_multiple):
+    tokenizer, _ = model_attributes
+    token_generator_creator = TokenGeneratorOperator()
+    prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))
+    token_generator_creator_output = token_generator_creator.run(
+        logits_shape=prompt_logits[0, -1, :].shape,
+        deterministic=True,
+        sampling_temperature=1.0,
+        tokens=copy.copy(mock_tokens_multiple),
+    )
+    return token_generator_creator_output.get("token_generator")
+
+
+@pytest.fixture(scope="module")
+def mock_logits(model_attributes):
+    tokenizer, _ = model_attributes
+    return numpy.random.rand(1, 1, len(tokenizer))
diff --git a/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput
+
+
+def test_kv_cache_creation(
+    text_generation_attributes, model_attributes, pipeline_state
+):
+    """
+    Check that KVCacheCreator exposes KVCacheCreatorInput as its input schema and,
+    given the single_token_engine attributes, returns a kv_cache with 0 processed tokens.
+    """
+    seq_length, prompt_seq_len = text_generation_attributes
+    tokenizer, _ = model_attributes
+    kv_cache_creator = KVCacheCreator(
+        tokenizer=tokenizer,
+        prompt_sequence_length=prompt_seq_len,
+        sequence_length=seq_length,
+        internal_kv_cache=False,
+    )
+
+    assert kv_cache_creator.input_schema == KVCacheCreatorInput
+    kv_cache = kv_cache_creator.run(
+        cache_shape=pipeline_state.current_state.get("cache_shape"),
+        kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"),
+        output_names=pipeline_state.current_state.get("output_names"),
+    )
+    assert kv_cache.get("kv_cache")
+    assert kv_cache.get("kv_cache").total_num_processed_tokens == 0
-Original file line number
+Diff line change
@@ Expand Up / @@ -29,7 +29,7 @@ @@
     )
-    __all__ = ["NLEngineOperator"]
+    __all__ = ["NLEngineOperator", "NlEngineInput"]
     class NlEngineInput(BaseModel):
@@ Expand Down @@