Commit 54138d4

Merge pull request #1020 from llmware-ai/update-onnx-ov-100424
adding new model classes
2 parents f13283f + 4514cfd commit 54138d4

File tree

4 files changed (+2219 additions, -136 deletions)


llmware/configs.py

Lines changed: 64 additions & 8 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 llmware
+# Copyright 2023 llmware
 
 # Licensed under the Apache License, Version 2.0 (the "License"); you
 # may not use this file except in compliance with the License.  You
@@ -502,6 +502,66 @@ def add_vector_db(cls, db_name, vector_db_class, module="llmware.embeddings"):
 logging.basicConfig(format=LLMWareConfig().get_logging_format(), level=LLMWareConfig().get_logging_level())
 
 
+class OVConfig:
+
+    """ Configuration object for OpenVINO - these parameters are consumed by the
+    OVGenerativeModel class in module llmware.models. In most cases, the parameters
+    do not require attention, but they are provided as additional options for
+    performance tuning, with GPU deployment in particular. """
+
+    _conf = {"device": "GPU",
+             "use_ov_tokenizer": False,
+             "generation_version": "ov_genai_pip",
+             "use_gpu_if_available": True,
+             "cache": True,
+             "cache_with_model": True,
+             "cache_custom_path": "",
+             "apply_performance_hints": True,
+             "verbose_mode": False,
+             "get_token_counts": True
+             }
+
+    _supported_hints = ["MODEL_PRIORITY", "GPU_HOST_TASK_PRIORITY",
+                        "GPU_QUEUE_THROTTLE", "GPU_QUEUE_PRIORITY"]
+
+    # this is a subset of useful GPU performance hints - will expand options over time
+
+    _gpu_hints = {
+        "MODEL_PRIORITY": "HIGH",
+        "GPU_HOST_TASK_PRIORITY": "HIGH",
+        "GPU_QUEUE_THROTTLE": "HIGH",
+        "GPU_QUEUE_PRIORITY": "HIGH"
+    }
+
+    @classmethod
+    def get_config(cls, param):
+        return cls._conf[param]
+
+    @classmethod
+    def set_config(cls, param, value):
+        cls._conf[param] = value
+
+    @classmethod
+    def get_gpu_hints(cls):
+        return cls._gpu_hints
+
+    @classmethod
+    def set_gpu_hint(cls, param, value):
+
+        # will add safety checks for type - most in form of "HIGH" | "MEDIUM" | "LOW"
+        # for more information, please see the OpenVINO documentation
+        if param in cls._supported_hints:
+            cls._gpu_hints[param] = value
+
+    @classmethod
+    def optimize_for_gpu(cls):
+        return cls._conf["use_gpu_if_available"]
+
+    @classmethod
+    def generation_version(cls):
+        return cls._conf["generation_version"]
+
+
 class MilvusConfig:
 
     """Configuration object for Milvus"""
@@ -598,6 +658,7 @@ def get_uri_string(cls):
 
         # canonical simple format of postgres uri string
         input_collection_db_path = f"postgresql://postgres@{host}:{port}/{db_name}"
+        # print("update: postgres get_uri_string - ", input_collection_db_path)
 
         return input_collection_db_path
 
@@ -658,7 +719,6 @@ def get_config(cls, name):
     def set_config(cls, name, value):
         cls._conf[name] = value
 
-
 class LanceDBConfig:
 
     _conf = {'uri': '/tmp/lancedb/'}
@@ -702,14 +762,13 @@ def get_uri_string (cls):
         db_file = os.path.join(cls._conf["sqlite_db_folder_path"], cls._conf["db_name"])
        return db_file
 
+    # new method for SQLTables DB
     @classmethod
     def get_uri_string_experimental_db(cls):
         """For SQLite the URI string is the local file with full absolute path"""
-
-        # used in SQLTables DB in llmware.agents module
-
         db_file = os.path.join(cls._conf["sqlite_db_folder_path"], cls._conf["db_experimental"])
         return db_file
+    # end method
 
     @classmethod
     def get_db_configs(cls):
@@ -782,8 +841,6 @@ def set_config(cls, name, value):
 
 class LLMWareTableSchema:
 
-    """ Table Schema used for Parsing, Library Cards and other llmware modules. """
-
     # notes:
     # 1. bigserial type for Postgres
     # 2. "text" and "table" replaced with "text_block" and "table_block" in SQL DB for safety / reserved
@@ -985,7 +1042,6 @@ class ChromaDBConfig:
     # update - v0.2.12 -> by default, persistent path set to make chroma persistent.
     # If this is None, then an in-memory only chroma instance will be created.
     #
-
     'persistent_path': LLMWareConfig().get_library_path(),
 
     #
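For reference, a minimal sketch of how the new OVConfig object could be exercised. The OVConfig calls come directly from the class added above; the ModelCatalog load at the end follows the standard llmware pattern and is an assumption here, as is the example prompt:

    from llmware.configs import OVConfig
    from llmware.models import ModelCatalog

    # route OpenVINO inference to CPU rather than the default "GPU" device
    OVConfig().set_config("device", "CPU")

    # tune one of the supported GPU hints - values are "HIGH" | "MEDIUM" | "LOW"
    OVConfig().set_gpu_hint("GPU_QUEUE_THROTTLE", "MEDIUM")

    print("device: ", OVConfig().get_config("device"))
    print("gpu hints: ", OVConfig().get_gpu_hints())
    print("generation version: ", OVConfig().generation_version())

    # assumed: standard llmware catalog load of one of the new OV models below
    model = ModelCatalog().load_model("bling-tiny-llama-ov")
    response = model.inference("What is the capital of France?")

Note that set_gpu_hint silently ignores unsupported keys, since it only writes keys found in _supported_hints.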

llmware/gguf_configs.py

Lines changed: 5 additions & 0 deletions
@@ -892,6 +892,11 @@ class GGUFConfigs:
              "force_gpu": False,
              "use_macos_accelerate": True,
 
+             # option to capture and provide the 'first token' of generation
+             # used for GGUF - and implemented for the HFGenerative (PyTorch) and
+             # ONNXGenerative classes as well
+             "get_first_token_speed": False,
+
              # prebuilt shared libraries included in llmware
              "windows": "libllama_win.dll",
              "windows_cuda": "libllama_win_cuda.dll",

llmware/model_configs.py

Lines changed: 72 additions & 2 deletions
@@ -15,7 +15,7 @@
 
 """Global Default Configs for Models, Finetune Wrappers and Prompt Instructions Catalog.
 
-These configs generally do not need to be accessed directly, but can be viewed, accessed and modified through
+These configs generally do not need to be accessed directly, but should be viewed, accessed and modified through
 ModelCatalog and PromptCatalog classes.
 
 For customization, there is also the option in ModelCatalog to load a custom model catalog from json file, which
@@ -24,7 +24,77 @@
 
 global_model_repo_catalog_list = [
 
-    # embedding models
+    {"model_name": "bling-tiny-llama-onnx", "model_family": "ONNXGenerativeModel",
+     "model_category": "generative_local", "display_name": "llmware/bling-tiny-llama-onnx",
+     "model_location": "llmware_repo", "context_window": 2048, "instruction_following": False,
+     "prompt_wrapper": "human_bot", "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "hf_repo": "llmware/bling-tiny-llama-onnx", "custom_model_files": [], "custom_model_repo": "",
+     "snapshot": True, "tokenizer_local": "tokenizer_tl.json",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["model.onnx", "model.onnx.data"],
+     "link": "https://huggingface.co/llmware/bling-tiny-llama-onnx"},
+
+    {"model_name": "bling-tiny-llama-ov", "model_family": "OVGenerativeModel",
+     "model_category": "generative_local", "display_name": "bling-tiny-llama-ov",
+     "model_location": "llmware_repo",
+     "context_window": 2048, "instruction_following": False, "prompt_wrapper": "human_bot",
+     "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "tokenizer_local": "tokenizer_tl.json",
+     "hf_repo": "llmware/bling-tiny-llama-ov",
+     "custom_model_files": [], "custom_model_repo": "",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["openvino_model.xml"],
+     "link": "https://huggingface.co/llmware/bling-tiny-llama-ov"},
+
+    {"model_name": "bling-phi-3-ov", "model_family": "OVGenerativeModel",
+     "model_category": "generative_local", "display_name": "bling-phi-3-ov",
+     "model_location": "llmware_repo",
+     "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot",
+     "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "tokenizer_local": "tokenizer_phi3.json",
+     "hf_repo": "llmware/bling-phi-3-ov",
+     "custom_model_files": [], "custom_model_repo": "",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["openvino_model.xml"],
+     "link": "https://huggingface.co/llmware/bling-phi-3-ov"},
+
+    {"model_name": "bling-phi-3-onnx", "model_family": "ONNXGenerativeModel",
+     "model_category": "generative_local", "display_name": "bling-phi-3-onnx",
+     "model_location": "llmware_repo",
+     "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot",
+     "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "tokenizer_local": "tokenizer_phi3.json",
+     "hf_repo": "llmware/bling-phi-3-onnx",
+     "custom_model_files": [], "custom_model_repo": "",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["model.onnx", "model.onnx.data"],
+     "link": "https://huggingface.co/llmware/bling-phi-3-onnx"},
+
+    {"model_name": "phi-3-onnx", "model_family": "ONNXGenerativeModel",
+     "model_category": "generative_local", "display_name": "phi-3-onnx",
+     "model_location": "llmware_repo",
+     "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot",
+     "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "tokenizer_local": "tokenizer_phi3.json",
+     "hf_repo": "llmware/phi-3-onnx",
+     "custom_model_files": [], "custom_model_repo": "",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["model.onnx", "model.onnx.data"],
+     "link": "https://huggingface.co/llmware/phi-3-onnx"},
+
+    {"model_name": "phi-3-ov", "model_family": "OVGenerativeModel",
+     "model_category": "generative_local", "display_name": "phi-3-ov",
+     "model_location": "llmware_repo",
+     "context_window": 4096, "instruction_following": False, "prompt_wrapper": "human_bot",
+     "temperature": 0.0, "sample_default": False, "trailing_space": "",
+     "tokenizer_local": "tokenizer_phi3.json",
+     "hf_repo": "llmware/phi-3-ov",
+     "custom_model_files": [], "custom_model_repo": "",
+     "fetch": {"snapshot": True, "module": "llmware.models", "method": "pull_snapshot_from_hf"},
+     "validation_files": ["openvino_model.xml"],
+     "link": "https://huggingface.co/llmware/phi-3-ov"},
+
+    # embedding models
 
     {"model_name": "all-MiniLM-L6-v2", "display_name": "mini-lm-sbert", "model_family": "HFEmbeddingModel",
      "model_category": "embedding", "model_location": "hf_repo", "embedding_dims": 384, "context_window": 512,
