From 78393e782b0b458b49f197da222e0de40e738b2c Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 11 Jul 2024 14:36:34 +0200
Subject: [PATCH 1/5] first commit

---
 .../text_generation/h2o-danube/run.py | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 natural_language_processing/text_generation/h2o-danube/run.py

diff --git a/natural_language_processing/text_generation/h2o-danube/run.py b/natural_language_processing/text_generation/h2o-danube/run.py
new file mode 100644
index 00000000..eadb7da2
--- /dev/null
+++ b/natural_language_processing/text_generation/h2o-danube/run.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2024, Ampere Computing LLC
+try:
+    from utils import misc  # noqa
+except ModuleNotFoundError:
+    import os
+    import sys
+    filename = "set_env_variables.sh"
+    directory = os.path.realpath(__file__).split("/")[:-1]
+    for idx in range(1, len(directory) - 1):
+        subdir = "/".join(directory[:-idx])
+        if filename in os.listdir(subdir):
+            print(f"\nPlease run \033[91m'source {os.path.join(subdir, filename)}'\033[0m first.")
+            break
+    else:
+        print(f"\n\033[91mFAIL: Couldn't find {filename}, are you running this script as part of Ampere Model Library?"
+              f"\033[0m")
+    sys.exit(1)
+
+
+def run_pytorch_fp32(model_name, num_runs, timeout, dataset_path, **kwargs):
+    import torch
+    from transformers import pipeline
+
+    from utils.benchmark import run_model
+    from utils.pytorch import apply_compile
+    from utils.pytorch import PyTorchRunnerV2
+    from utils.nlp.alpaca_instruct import AlpacaInstruct
+
+    # model = DiffusionPipeline.from_pretrained(model_name,
+    #                                           use_safetensors=True,
+    #                                           torch_dtype=torch.bfloat16).to("cpu")
+
+    pipe = pipeline("text-generation", model="h2oai/h2o-danube2-1.8b-chat",
+                    torch_dtype=torch.bfloat16, device_map="auto")
+
+    # messages = [{"role": "user", "content": "Why is drinking water so healthy?"}]
+    # prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # res = pipe(prompt, max_new_tokens=256)
+
+    # print(res[0]["generated_text"])
+
+    # model.unet = apply_compile(model.unet)
+
+    def single_pass_pytorch(_runner, _dataset):
+        prompt = encode([{"role": "user", "content": _dataset.get_input_string()}])
+        # prompts = [_stablediffusion.get_input() for _ in range(batch_size)]
+        res = _runner.run(1, prompt=prompt, max_new_tokens=256)
+        print(res[0]["generated_text"])
+
+    runner = PyTorchRunnerV2(pipe)
+
+    dataset = AlpacaInstruct(1, dataset_path=dataset_path)
+    tokenizer = pipe.tokenizer.apply_chat_template
+    encode = lambda i: tokenizer(i, tokenize=False, add_generation_prompt=True)
+    return run_model(single_pass_pytorch, runner, dataset, 1, num_runs, timeout)
+
+
+if __name__ == "__main__":
+    from utils.helpers import DefaultArgParser
+
+    h2o_danube_variants = ["h2oai/h2o-danube2-1.8b-chat"]
+    parser = DefaultArgParser(["pytorch"])
+    parser.require_model_name(h2o_danube_variants)
+    parser.add_argument("--dataset_path",
+                        type=str,
+                        help="path to JSON file with instructions")
+
+    run_pytorch_fp32(**vars(parser.parse()))

From c0641ba957479a37ef28e9b888ab1d43cefa037d Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 11 Jul 2024 15:28:28 +0200
Subject: [PATCH 2/5] wip

---
 .../text_generation/h2o-danube/run.py | 21 +++++----------------
 utils/nlp/alpaca_instruct.py          |  4 ++++
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/natural_language_processing/text_generation/h2o-danube/run.py b/natural_language_processing/text_generation/h2o-danube/run.py
index eadb7da2..444c0178 100644
--- a/natural_language_processing/text_generation/h2o-danube/run.py
+++ b/natural_language_processing/text_generation/h2o-danube/run.py
@@ -27,27 +27,16 @@ def run_pytorch_fp32(model_name, num_runs, timeout, dataset_path, **kwargs):
     from utils.pytorch import PyTorchRunnerV2
     from utils.nlp.alpaca_instruct import AlpacaInstruct
 
-    # model = DiffusionPipeline.from_pretrained(model_name,
-    #                                           use_safetensors=True,
-    #                                           torch_dtype=torch.bfloat16).to("cpu")
-
-    pipe = pipeline("text-generation", model="h2oai/h2o-danube2-1.8b-chat",
+    pipe = pipeline("text-generation", model=model_name,
                     torch_dtype=torch.bfloat16, device_map="auto")
 
-    # messages = [{"role": "user", "content": "Why is drinking water so healthy?"}]
-    # prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    # res = pipe(prompt, max_new_tokens=256)
-
-    # print(res[0]["generated_text"])
-
-    # model.unet = apply_compile(model.unet)
+    model = apply_compile(pipe)
 
     def single_pass_pytorch(_runner, _dataset):
         prompt = encode([{"role": "user", "content": _dataset.get_input_string()}])
-        # prompts = [_stablediffusion.get_input() for _ in range(batch_size)]
-        res = _runner.run(1, prompt=prompt, max_new_tokens=256)
-        print(res[0]["generated_text"])
+        response = _runner.run(1, prompt, max_new_tokens=256)
+        _dataset.submit_prediction(response[0]["generated_text"])
+        # print(res[0]["generated_text"])
 
     runner = PyTorchRunnerV2(pipe)
 
diff --git a/utils/nlp/alpaca_instruct.py b/utils/nlp/alpaca_instruct.py
index c84d3b2e..ff6a9f3d 100644
--- a/utils/nlp/alpaca_instruct.py
+++ b/utils/nlp/alpaca_instruct.py
@@ -31,6 +31,10 @@ def __init__(self, batch_size: int, dataset_path=None):
 
     def get_input_string(self):
         self._current_sample += 1
+        print('====')
+        print(self._current_sample)
+        print(self._batch_size)
+        print('====')
         assert self._current_sample * self._batch_size == self._count
         prompt = ("Below is an instruction that describes a task. "

From e777d6245265a4b2eff4eb0a9d1e8d4d2655b226 Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 11 Jul 2024 15:31:12 +0200
Subject: [PATCH 3/5] wip

---
 natural_language_processing/text_generation/h2o-danube/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/natural_language_processing/text_generation/h2o-danube/run.py b/natural_language_processing/text_generation/h2o-danube/run.py
index 444c0178..cf5c5900 100644
--- a/natural_language_processing/text_generation/h2o-danube/run.py
+++ b/natural_language_processing/text_generation/h2o-danube/run.py
@@ -30,7 +30,7 @@ def run_pytorch_fp32(model_name, num_runs, timeout, dataset_path, **kwargs):
     pipe = pipeline("text-generation", model=model_name,
                     torch_dtype=torch.bfloat16, device_map="auto")
 
-    model = apply_compile(pipe)
+    pipe.model = apply_compile(pipe.model)
 
     def single_pass_pytorch(_runner, _dataset):
         prompt = encode([{"role": "user", "content": _dataset.get_input_string()}])

From a0cc3982048ddde42205739b118c413d5f6ae69c Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 11 Jul 2024 15:32:42 +0200
Subject: [PATCH 4/5] wip

---
 utils/nlp/alpaca_instruct.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/nlp/alpaca_instruct.py b/utils/nlp/alpaca_instruct.py
index ff6a9f3d..73bc8c7e 100644
--- a/utils/nlp/alpaca_instruct.py
+++ b/utils/nlp/alpaca_instruct.py
@@ -34,6 +34,8 @@ def get_input_string(self):
         print('====')
         print(self._current_sample)
         print(self._batch_size)
+        print(self._count)
+        print(self._current_sample * self._batch_size)
         print('====')
         assert self._current_sample * self._batch_size == self._count
         prompt = ("Below is an instruction that describes a task. "

From 8a60ea2eb42e90d6e6b50065989467dcc098ce3d Mon Sep 17 00:00:00 2001
From: Marcel Wilnicki
Date: Thu, 11 Jul 2024 16:13:25 +0200
Subject: [PATCH 5/5] wip

---
 utils/nlp/alpaca_instruct.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/utils/nlp/alpaca_instruct.py b/utils/nlp/alpaca_instruct.py
index 73bc8c7e..c84d3b2e 100644
--- a/utils/nlp/alpaca_instruct.py
+++ b/utils/nlp/alpaca_instruct.py
@@ -31,12 +31,6 @@ def __init__(self, batch_size: int, dataset_path=None):
 
     def get_input_string(self):
         self._current_sample += 1
-        print('====')
-        print(self._current_sample)
-        print(self._batch_size)
-        print(self._count)
-        print(self._current_sample * self._batch_size)
-        print('====')
         assert self._current_sample * self._batch_size == self._count
         prompt = ("Below is an instruction that describes a task. "