From 53cb9eca9850ee9f79747799f7d8d5b5c48c6585 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Tue, 30 Jan 2024 13:05:40 +0000
Subject: [PATCH 1/8] initial commit

---
 src/deepsparse/evaluation/cli.py              | 28 +++-----
 src/deepsparse/evaluation/evaluator.py        | 21 +++---
 src/deepsparse/evaluation/registry.py         |  9 +--
 src/deepsparse/evaluation/utils.py            | 64 +++++++++----------
 .../test_lm_evaluation_harness.py             |  6 +-
 tests/deepsparse/evaluation/test_evaluator.py | 22 ++++---
 tests/deepsparse/evaluation/test_utils.py     | 47 ++------------
 7 files changed, 74 insertions(+), 123 deletions(-)

diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py
index ed7ea72831..f37ed46d0c 100644
--- a/src/deepsparse/evaluation/cli.py
+++ b/src/deepsparse/evaluation/cli.py
@@ -20,7 +20,7 @@
 Module for evaluating models on the various evaluation integrations

 OPTIONS:
-    --target TARGET     A path to a remote or local directory containing ONNX/torch model
+    --target TARGET     A path to a remote or local directory containing ONNX model
                         (including all the auxiliary files) or a SparseZoo stub
     -d DATASET, --dataset DATASET
                         The dataset to evaluate on. The user may pass multiple datasets
@@ -30,9 +30,7 @@
                         integration name that is registered in the evaluation registry
     -e ENGINE_TYPE, --engine_type ENGINE_TYPE
                         Inference engine to use for the evaluation. The default
-                        is the DeepSparse engine. If the evaluation should be run
-                        without initializing a pipeline (e.g. for the evaluation
-                        of a torch model), the engine type should be set to None
+                        is the DeepSparse engine.
     -s SAVE_PATH, --save_path SAVE_PATH
                         The path to save the evaluation results. By default
                         the results will be saved in the
@@ -90,10 +88,10 @@
     )
 )
 @click.option(
-    "--target",
+    "--model_path",
     type=click.Path(dir_okay=True, file_okay=True),
     required=True,
-    help="A path to a remote or local directory containing ONNX/torch model "
+    help="A path to a remote or local directory containing ONNX model "
     "(including all the auxiliary files) or a SparseZoo stub",
 )
 @click.option(
@@ -118,9 +116,7 @@
     type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE]),
     default=DEEPSPARSE_ENGINE,
     help="The engine to use for the evaluation. The default is the "
-    "DeepSparse engine. If the evaluation should be run without "
-    "initializing a pipeline (e.g. for the evaluation of a torch "
-    "model), the engine type should be set to None",
+    "DeepSparse engine. ",
 )
 @click.option(
     "-s",
@@ -167,7 +163,7 @@
 )
 @click.argument("integration_args", nargs=-1, type=click.UNPROCESSED)
 def main(
-    target,
+    model_path,
     dataset,
     integration,
     engine_type,
@@ -183,14 +179,8 @@ def main(
     # format kwargs to a dict
     integration_args = args_to_dict(integration_args)

-    _LOGGER.info(f"Target to evaluate: {target}")
-    if engine_type:
-        _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created")
-    else:
-        _LOGGER.info(
-            "No engine type specified. The target "
-            "will be evaluated using the native framework"
-        )
+    _LOGGER.info(f"Creating pipeline to evaluate from: {model_path}")
+    _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created")

     _LOGGER.info(
         f"Datasets to evaluate on: {datasets}\n"
@@ -201,7 +191,7 @@ def main(
     )

     result: Result = evaluate(
-        target=target,
+        model_path=model_path,
         datasets=datasets,
         integration=integration,
         engine_type=engine_type,
diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py
index 7bd56adf6e..9d1b3228a7 100644
--- a/src/deepsparse/evaluation/evaluator.py
+++ b/src/deepsparse/evaluation/evaluator.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from pathlib import Path
 from typing import Any, List, Optional, Union

 from deepsparse.evaluation.registry import EvaluationRegistry
 from deepsparse.evaluation.results import Result
-from deepsparse.evaluation.utils import create_model_from_target
+from deepsparse.evaluation.utils import create_pipeline
 from deepsparse.operators.engine_operator import (
     DEEPSPARSE_ENGINE,
     ORT_ENGINE,
@@ -30,11 +31,11 @@


 def evaluate(
-    target: Any,
+    model_path: Any,
     datasets: Union[str, List[str]],
     integration: Optional[str] = None,
     engine_type: Union[
-        DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, None
+        DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE
     ] = DEEPSPARSE_ENGINE,
     batch_size: int = 1,
     splits: Union[List[str], str, None] = None,
@@ -42,18 +43,18 @@ def evaluate(
     **kwargs,
 ) -> Result:

-    # if target is a string, turn it into an appropriate model/pipeline
+    # if target is a string, turn it into an appropriate pipeline
     # otherwise assume it is a model/pipeline
-    model = (
-        create_model_from_target(target, engine_type)
-        if isinstance(target, str)
-        else target
+    pipeline = (
+        create_pipeline(model_path, engine_type)
+        if isinstance(model_path, (Path, str))
+        else model_path
     )

-    eval_integration = EvaluationRegistry.resolve(model, datasets, integration)
+    eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration)

     return eval_integration(
-        model=model,
+        pipeline=pipeline,
         datasets=datasets,
         engine_type=engine_type,
         batch_size=batch_size,
diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py
index 5b6e45bc1c..2daabb69cc 100644
--- a/src/deepsparse/evaluation/registry.py
+++ b/src/deepsparse/evaluation/registry.py
@@ -15,8 +15,9 @@
 Implementation of a registry for evaluation functions
 """
 import logging
-from typing import Any, Callable, List, Optional, Union
+from typing import Callable, List, Optional, Union

+from deepsparse import Pipeline
 from sparsezoo.utils.registry import RegistryMixin

@@ -38,7 +39,7 @@ def load_from_registry(cls, name: str) -> Callable[..., "Result"]:  # noqa: F821
     @classmethod
     def resolve(
         cls,
-        model: Any,
+        pipeline: Pipeline,
         datasets: Union[str, List[str]],
         integration: Optional[str] = None,
     ) -> Callable[..., "Result"]:  # noqa: F821
@@ -59,12 +60,12 @@ def resolve(
             "No integration specified, inferring the evaluation"
             "function from the input arguments..."
         )
-        integration = resolve_integration(model, datasets)
+        integration = resolve_integration(pipeline, datasets)

         if integration is None:
             raise ValueError(
                 "Unable to resolve an evaluation function for the given model. "
-                "Specify an integration name or use a model that is supported "
+                "Specify an integration name or use a pipeline that is supported "
             )

         _LOGGER.info(f"Inferred the evaluation function: {integration}")
diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py
index 0534a9f9f3..7290f14adb 100644
--- a/src/deepsparse/evaluation/utils.py
+++ b/src/deepsparse/evaluation/utils.py
@@ -15,14 +15,11 @@
 import os
 from typing import Any, Dict, List, Optional, Tuple, Union

-from transformers import AutoModelForCausalLM, PreTrainedModel
-
 from deepsparse import Pipeline
-from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE


 __all__ = [
-    "create_model_from_target",
+    "create_pipeline",
     "get_save_path",
     "args_to_dict",
     "resolve_integration",
@@ -50,7 +47,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool:


 def resolve_integration(
-    model: Union[Pipeline, PreTrainedModel], datasets: Union[str, List[str]]
+    pipeline: Pipeline, datasets: Union[str, List[str]]
 ) -> Union[str, None]:
     """
     Given a model and dataset, infer the name of the evaluation integration
@@ -64,21 +61,22 @@ def resolve_integration(
     :param datasets: The datasets to infer the integration for
     :return: The name of the integration to use or None if unable to infer
     """
-    if if_generative_language_model(model):
+    if if_generative_language_model(pipeline):
         return LM_EVALUATION_HARNESS
     return None


-def if_generative_language_model(model: Any) -> bool:
+def if_generative_language_model(pipeline: Pipeline) -> bool:
     """
     Checks if the model is a generative language model.
     """
-    if isinstance(model, Pipeline):
-        return model.__class__.__name__ == "TextGenerationPipeline"
-    elif isinstance(model, PreTrainedModel):
-        return "CausalLM" in model.__class__.__name__
-    else:
-        return False
+    pipeline_name = pipeline.__class__.__name__
+    if pipeline_name == "TextGenerationPipeline" or (
+        pipeline_name == "TextGenerationPipelineNoKVCache"
+    ):
+        return True
+
+    return False


 def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]:
@@ -126,34 +124,30 @@ def get_save_path(
     return os.path.join(base_path, file_name)


-def create_model_from_target(
-    target: str,
+def create_pipeline(
+    model_path: str,
     engine_type: Optional[str] = None,
     **kwargs,
-) -> Union[Pipeline, AutoModelForCausalLM]:
+) -> Pipeline:
     """
-    Create a model or a pipeline from a target path.
+    Create a pipeline for evaluation

-    Note: This function is currently limited to:
-    - creating pipelines of type 'text-generation'
-    - creating dense huggingface models of type 'AutoModelForCausalLM'
-    This function will be expanded in the future to support more
-    model types and frameworks.
+    Note: This function is currently primarily
+    focused on creating pipelines of type 'text-generation'
+    This function will be expanded in the future to support
+    more tasks and models

-    :param target: The target path to initialize the
+    :param model_path: The target path to initialize the
        text generation model from. This can be a local
        or remote path to the model or a sparsezoo stub
     :param engine_type: The engine type to initialize the model with.
-    :return: The initialized model
+    :return: The initialized pipeline
     """
-    if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]:
-        return Pipeline.create(
-            task="text-generation",
-            model_path=target,
-            sequence_length=kwargs.pop("sequence_length", 2048),
-            engine_type=engine_type,
-            batch_size=kwargs.pop("batch_size", 1),
-            **kwargs,
-        )
-    else:
-        return AutoModelForCausalLM.from_pretrained(target, **kwargs)
+    return Pipeline.create(
+        task=kwargs.pop("task", "text-generation"),
+        model_path=model_path,
+        sequence_length=kwargs.pop("sequence_length", 2048),
+        engine_type=engine_type,
+        batch_size=kwargs.pop("batch_size", 1),
+        **kwargs,
+    )
diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
index 9fa9b494cf..db847af1ad 100644
--- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
+++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
@@ -14,17 +14,17 @@

 import pytest
 from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness
-from deepsparse.evaluation.utils import create_model_from_target
+from deepsparse.evaluation.utils import create_pipeline


 @pytest.mark.parametrize(
     "pipeline, model_torch",
     [
         (
-            create_model_from_target(
+            create_pipeline(
                 "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime"
             ),
-            create_model_from_target("roneneldan/TinyStories-1M"),
+            create_pipeline("roneneldan/TinyStories-1M"),
         )
     ],
 )
diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py
index dedd63fa36..f1bc0c277a 100644
--- a/tests/deepsparse/evaluation/test_evaluator.py
+++ b/tests/deepsparse/evaluation/test_evaluator.py
@@ -49,7 +49,7 @@ def dummy_integration(*args, **kwargs):


 @pytest.fixture()
-def target():
+def model_path():
     return "hf:mgoin/TinyStories-1M-deepsparse"

@@ -68,18 +68,18 @@ def unknown_integration_name():
     return "unknown_integration"


-def test_evaluate_unknown_integration(target, datasets, unknown_integration_name):
+def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name):
     with pytest.raises(KeyError):
         evaluate(
-            target=target,
+            model_path=model_path,
             datasets=datasets,
             integration=unknown_integration_name,
         )


-def test_evaluate(target, datasets, dummy_integration_name):
+def test_evaluate(model_path, datasets, dummy_integration_name):
     result = evaluate(
-        target=target,
+        model_path=model_path,
         datasets=datasets,
         integration=dummy_integration_name,
     )
@@ -91,11 +91,11 @@ def test_evaluate(target, datasets, dummy_integration_name):
     reason="lm_evaluation_harness not installed",
 )
 def test_evaluation_llm_evaluation_harness_integration_name(
-    target,
+    model_path,
     datasets,
 ):
     assert evaluate(
-        target=target,
+        model_path=model_path,
         datasets=datasets,
         limit=2,
         no_cache=True,
@@ -110,15 +110,17 @@ def test_evaluation_llm_evaluation_harness_integration_name(
     "with importing functions that are decorated with "
     "click option where multiple=True",
 )
-def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serialization):
+def test_cli(
+    tmp_path, model_path, datasets, dummy_integration_name, type_serialization
+):
     from deepsparse.evaluation.cli import main

     runner = CliRunner()
     runner.invoke(
         main,
         [
-            "--target",
-            target,
+            "--model_path",
+            model_path,
             "--dataset",
             datasets[0],
             "--dataset",
diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py
index f712dce0df..a16cb8ee32 100644
--- a/tests/deepsparse/evaluation/test_utils.py
+++ b/tests/deepsparse/evaluation/test_utils.py
@@ -23,23 +23,13 @@
 import pytest
 from deepsparse import Pipeline
 from deepsparse.evaluation.utils import (
-    create_model_from_target,
+    create_pipeline,
     get_save_path,
     if_generative_language_model,
     resolve_integration,
 )


-@pytest.fixture
-def llm_type_hf_model():
-    return AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M")
-
-
-@pytest.fixture
-def not_llm_type_hf_model():
-    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
-
-
 @pytest.fixture
 def llm_type_pipeline():
     return Pipeline.create(
@@ -49,25 +39,13 @@ def llm_type_pipeline():
     )


-def test_resolve_known_llm_model(llm_type_hf_model):
+def test_resolve_known_llm_pipeline(llm_type_pipeline):
     assert (
-        resolve_integration(model=llm_type_hf_model, datasets="")
+        resolve_integration(pipeline=llm_type_pipeline, datasets="")
         == "lm-evaluation-harness"
     )


-def test_resolve_unknown_model(not_llm_type_hf_model):
-    assert resolve_integration(model=not_llm_type_hf_model, datasets="") is None
-
-
-def test_if_generative_language_model_true(llm_type_hf_model):
-    assert if_generative_language_model(llm_type_hf_model)
-
-
-def test_if_generative_language_model_false(not_llm_type_hf_model):
-    assert not if_generative_language_model(not_llm_type_hf_model)
-
-
 def test_if_generative_language_pipeline_true(llm_type_pipeline):
     assert if_generative_language_model(llm_type_pipeline)
@@ -89,26 +67,11 @@ def pipeline_target():
     return "hf:mgoin/TinyStories-1M-deepsparse"


-@pytest.fixture
-def torch_target():
-    return "roneneldan/TinyStories-1M"
-
-
 def test_initialize_model_from_target_pipeline_onnx(pipeline_target):
-    model = create_model_from_target(pipeline_target, "onnxruntime")
+    model = create_pipeline(pipeline_target, "onnxruntime")
     assert model.ops.get("single_engine")._engine_type == "onnxruntime"


-def test_initialize_model_from_target_pipeline_deepsparse(pipeline_target):
-    model = create_model_from_target(pipeline_target, "deepsparse")
-    assert model.ops.get("single_engine")._engine_type == "deepsparse"
-
-
 def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target):
-    model = create_model_from_target(pipeline_target, "deepsparse", sequence_length=64)
+    model = create_pipeline(pipeline_target, "deepsparse", sequence_length=64)
     assert model.ops.get("process_input").sequence_length == 64
-
-
-def test_initialize_model_from_target_torch(torch_target):
-    model = create_model_from_target(torch_target, "torch")
-    assert isinstance(model, GPTNeoForCausalLM)

From 6599f41cb08cdb2903420d8c17fb3486c6c395ac Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Tue, 30 Jan 2024 14:03:14 +0000
Subject: [PATCH 2/8] add some more tests for hardening

---
 src/deepsparse/evaluation/cli.py               |  7 +++--
 src/deepsparse/evaluation/evaluator.py         | 21 +++++++++----
 .../pipelines/text_generation/pipeline.py      |  7 +++++
 .../text_generation/pipeline_no_kv_cache.py    |  8 +++++
 .../test_lm_evaluation_harness.py              |  4 ++-
 tests/deepsparse/evaluation/test_evaluator.py  | 31 +++++++++++++++++--
 6 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py
index f37ed46d0c..e0e16cb4ab 100644
--- a/src/deepsparse/evaluation/cli.py
+++ b/src/deepsparse/evaluation/cli.py
@@ -20,7 +20,8 @@
 Module for evaluating models on the various evaluation integrations

 OPTIONS:
-    --model_path MODEL_PATH
+    --model_path MODEL_PATH
+                        A path to an ONNX model, local directory containing ONNX model
                         (including all the auxiliary files) or a SparseZoo stub
     -d DATASET, --dataset DATASET
                         The dataset to evaluate on. The user may pass multiple datasets
@@ -91,7 +92,7 @@
     "--model_path",
     type=click.Path(dir_okay=True, file_okay=True),
     required=True,
-    help="A path to a remote or local directory containing ONNX model "
+    help="A path to an ONNX model, local directory containing ONNX model"
     "(including all the auxiliary files) or a SparseZoo stub",
 )
 @click.option(
@@ -191,7 +192,7 @@ def main(
     )

     result: Result = evaluate(
-        model_path=model_path,
+        model=model_path,
         datasets=datasets,
         integration=integration,
         engine_type=engine_type,
diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py
index 9d1b3228a7..b513f07563 100644
--- a/src/deepsparse/evaluation/evaluator.py
+++ b/src/deepsparse/evaluation/evaluator.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 import logging
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import List, Optional, Union

+from deepsparse import Pipeline
 from deepsparse.evaluation.registry import EvaluationRegistry
 from deepsparse.evaluation.results import Result
 from deepsparse.evaluation.utils import create_pipeline
@@ -31,7 +32,7 @@


 def evaluate(
-    model_path: Any,
+    model: Union[Pipeline, Path, str],
     datasets: Union[str, List[str]],
     integration: Optional[str] = None,
     engine_type: Union[
@@ -43,12 +44,20 @@ def evaluate(
     **kwargs,
 ) -> Result:

+    if isinstance(model, Pipeline):
+        _LOGGER.info(
+            "Passed a Pipeline object into evaluate function. This will "
+            "override the following arguments:"
+        )
+        batch_size = model.batch_size
+        _LOGGER.info(f"batch_size: {batch_size}")
+        engine_type = engine_type
+        _LOGGER.info(f"engine_type: {engine_type}")
+
     # if target is a string, turn it into an appropriate pipeline
     # otherwise assume it is a pipeline
     pipeline = (
-        create_pipeline(model_path, engine_type)
-        if isinstance(model_path, (Path, str))
-        else model_path
+        create_pipeline(model, engine_type) if isinstance(model, (Path, str)) else model
     )
diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py
index 2c858c901b..bbc0e8ba15 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py
@@ -357,6 +357,13 @@ def sequence_length(self) -> int:
         """
         return self.ops["single_engine"].sequence_length

+    def batch_size(self) -> int:
+        return self.ops["single_engine"].batch_size
+
+    @property
+    def engine_type(self) -> str:
+        return self.ops["single_engine"]._engine_type
+
     def _get_continuous_batching_scheduler(
         self, batch_sizes: List[int], engines: List[EngineOperator]
     ) -> ContinuousBatchingScheduler:
diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py
index 7f6cb9db5f..c6cbc3dd59 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py
@@ -127,3 +127,11 @@ def expand_inputs(self, items, batch_size):
         out, orig_batch_size = split_engine_inputs(items, batch_size)
         combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out]
         return combined_batches, orig_batch_size
+
+    @property
+    def batch_size(self) -> int:
+        return self.ops["engine_operator"].batch_size
+
+    @property
+    def engine_type(self) -> str:
+        return self.ops["engine_operator"]._engine_type
diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
index db847af1ad..3b9016294f 100644
--- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
+++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from transformers import AutoModelForCausalLM
+
 import pytest
 from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness
 from deepsparse.evaluation.utils import create_pipeline
@@ -24,7 +26,7 @@
             create_pipeline(
                 "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime"
             ),
-            create_pipeline("roneneldan/TinyStories-1M"),
+            AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"),
         )
     ],
 )
diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py
index f1bc0c277a..816ad075e0 100644
--- a/tests/deepsparse/evaluation/test_evaluator.py
+++ b/tests/deepsparse/evaluation/test_evaluator.py
@@ -29,6 +29,7 @@
     Metric,
     Result,
 )
+from deepsparse.pipeline import Pipeline


 @EvaluationRegistry.register()
@@ -71,7 +72,7 @@ def unknown_integration_name():
 def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name):
     with pytest.raises(KeyError):
         evaluate(
-            model_path=model_path,
+            model=model_path,
             datasets=datasets,
             integration=unknown_integration_name,
         )
@@ -79,7 +80,31 @@ def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_
 def test_evaluate(model_path, datasets, dummy_integration_name):
     result = evaluate(
-        model_path=model_path,
+        model=model_path,
         datasets=datasets,
         integration=dummy_integration_name,
     )
     assert isinstance(result, Result)
+
+
+def test_evaluate_pipeline_with_kv_cache(model_path, datasets, dummy_integration_name):
+    result = evaluate(
+        model=Pipeline.create(model_path=model_path, task="text-generation"),
+        datasets=datasets,
+        integration=dummy_integration_name,
+    )
+    assert isinstance(result, Result)
+
+
+def test_evaluate_pipeline_without_kv_cache(
+    model_path, datasets, dummy_integration_name
+):
+    result = evaluate(
+        model=Pipeline.create(
+            model_path=model_path,
+            task="text-generation",
+            onnx_model_name="model-orig.onnx",
+        ),
+        datasets=datasets,
+        integration=dummy_integration_name,
+    )
+    assert isinstance(result, Result)
@@ -95,7 +120,7 @@ def test_evaluation_llm_evaluation_harness_integration_name(
     datasets,
 ):
     assert evaluate(
-        model_path=model_path,
+        model=model_path,
         datasets=datasets,
         limit=2,
         no_cache=True,

From 4721c1fcd656a4e04b72eb3128fb121ca2297824 Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Tue, 30 Jan 2024 15:04:32 +0100
Subject: [PATCH 3/8] Update src/deepsparse/evaluation/cli.py

---
 src/deepsparse/evaluation/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py
index e0e16cb4ab..9c8fe3d06a 100644
--- a/src/deepsparse/evaluation/cli.py
+++ b/src/deepsparse/evaluation/cli.py
@@ -180,7 +180,7 @@ def main(
     # format kwargs to a dict
     integration_args = args_to_dict(integration_args)

_LOGGER.info(f"Creating pipeline to evaluate from: {model_path}") + _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") _LOGGER.info( From 124779435927ec266a18c3486780a77068c3f71a Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:06:35 +0100 Subject: [PATCH 4/8] Update src/deepsparse/transformers/pipelines/text_generation/pipeline.py --- .../transformers/pipelines/text_generation/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index bbc0e8ba15..4a38392d76 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From 9e88f89e7ea175d05eed4bacbb86ac1abda8f3fd Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:07:31 +0100 Subject: [PATCH 5/8] Apply suggestions from code review --- src/deepsparse/evaluation/cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 9c8fe3d06a..6979521c7a 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,8 +180,7 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") + _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" From fdb21c6cf093bc527c6c318af1bb0e5b96ee68e8 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 30 Jan 2024 14:08:09 +0000 Subject: [PATCH 6/8] quality --- src/deepsparse/evaluation/cli.py | 4 +++- .../transformers/pipelines/text_generation/pipeline.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 6979521c7a..43eaa33790 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,7 +180,9 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") + _LOGGER.info( + f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" + ) _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 4a38392d76..64c0c64a51 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,7 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length - @property + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From e7d8c3127dafcbec9b380949a3fe189da77b24ba Mon Sep 17 00:00:00 2001 From: dbogunowicz 
<97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:35:26 +0100 Subject: [PATCH 7/8] Update test_evaluator.py --- tests/deepsparse/evaluation/test_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..61a1eb3891 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -133,6 +133,6 @@ def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serializat standalone_mode=False, ) # makes sure that the result file is created - assert os.path.isfile( - os.path.join(os.path.dirname(str(tmp_path)), f"result.{type_serialization}") + assert os.path.isfile(os.path.join(os.path.dirname(str(tmp_path)), + f"result.{type_serialization}") ) From a9e98478ec394b673749f2c496228a4061b02281 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 10:54:06 +0000 Subject: [PATCH 8/8] quality --- src/deepsparse/evaluation/cli.py | 2 +- tests/deepsparse/evaluation/test_utils.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 43eaa33790..b68d32d4e5 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + --model_path MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index a16cb8ee32..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -14,12 +14,6 @@ import os -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - GPTNeoForCausalLM, -) - import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import (