semiotic-ai · denverbaumgartner · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025
@@ -1,5 +1,5 @@
 OPENAI_API_KEY=<your-openai-api-key>
-HF_DATASET_KEY=<your-huggingface-dataset-key>
+HF_DATASET_KEY=<can-be-empty-if-you-do-not-want-to-use-huggingface>
 MLFLOW_TRACKING_URI=<your-mlflow-tracking-uri>
 MLFLOW_TRACKING_USERNAME=admin
 MLFLOW_TRACKING_PASSWORD=password
@@ -11,11 +11,13 @@ We utilize [poetry](https://python-poetry.org/) for dependency management. Pleas
 
 We utilize [commitizen](https://commitizen-tools.github.io/commitizen/) for commit messages and semantic versioning. Please run `cz commit` to commit your changes. Commitizen can be installed with `pip install commitizen` or `brew install commitizen`.
 
+We utilize [docker](https://www.docker.com/) to manage the tracking of our service and its associated experiments through [mlflow](https://mlflow.org/). In our docker setup, we spin up [mlflow](https://mlflow.org/), [postgres](https://www.postgresql.org/), and [minio](https://min.io/) instances. This is very similar to our production setup and allows for a smooth development flow between local and prod. Please ensure you have installed docker and that it is running in the background on your machine.
+
 Here are some quick commands for getting started:
 
 ```bash 
-brew add poetry
-brew add commitizen
+brew install poetry
+brew install commitizen
 ```
 
 ```bash 
@@ -26,11 +28,35 @@ cd ../mlflow-manager
 poetry install 
 ```
 
+### .env
+
+There are two `.env` files that we expect the user to set up. They are divided between `mlflow-manager` and `graphdoc`. First, let's set up the `mlflow-manager` `.env` file. You can leave these values as they are, or modify them as you see fit:
+
+```bash
+# navigate to the docker root
+cd mlflow-manager
+cd docker
+
+# copy the .env.example for setup
+cp .env.example .env # set values directly in your newly created .env file 
+```
+
+Next, let's set up the `.env` file to be used by our `graphdoc` program. 
+
+```bash
+# navigate to the graphdoc root 
+cd ../..
+
+# copy the .env.example for setup
+cp .env.example .env # set values directly in your newly created .env file 
+```
+
 ### run.sh
 
 The `run.sh` script is a convenience script for development. It provides a few shortcuts for running useful commands.
 
 ```bash 
+# make sure you are in the root of the repository 
 # ensure that the script is executable
 chmod +x run.sh
 
@@ -41,6 +67,8 @@ chmod +x run.sh
 To set up the mlflow-manager services, run the following command:
 
 ```bash 
+# default username: admin
+# default password: password
 ./run.sh mlflow-setup
 ```
 

@@ -1,8 +1,5 @@
 graphdoc: 
   log_level: INFO                                       # The log level to use (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
-  mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
-  mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
 mlflow: 
   mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
@@ -26,6 +23,7 @@ data:
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
   seed: 42                                              # The seed for the random number generator
+
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
@@ -50,18 +48,15 @@ prompt_metric:
 trainer: 
   class: DocGeneratorTrainer                            # The type of trainer to use (DocQualityTrainer, DocGeneratorTrainer)
   optimizer_type: miprov2                               # The type of optimizer to use (miprov2, BootstrapFewShotWithRandomSearch)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
   mlflow_model_name: doc_generator_model                # The name of the model in MLflow
   mlflow_experiment_name: doc_generator_experiment      # The name of the experiment in MLflow
 
 optimizer: 
   optimizer_type: miprov2                               # BootstrapFewShotWithRandomSearch, miprov2
   auto: light                                           # miprov2 setting
-  # student: this is the prompt.infer object
-  # trainset: this is the dataset we are working with 
-  max_labeled_demos: 2
-  max_bootstrapped_demos: 4
-  num_trials: 2
+  max_labeled_demos: 2                                  # max number of labeled demonstrations
+  max_bootstrapped_demos: 4                             # max number of bootstrapped demonstrations
+  num_trials: 2                                         # number of trials
   minibatch: true                                       # default true
 
 module: 

@@ -1,8 +1,5 @@
 graphdoc: 
   log_level: INFO                                       # The log level to use (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
-  mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
-  mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
 mlflow: 
   mlflow_tracking_uri: !env MLFLOW_TRACKING_URI           # The tracking URI for MLflow
@@ -51,18 +48,15 @@ prompt_metric:
 trainer: 
   class: DocGeneratorTrainer                            # The type of trainer to use (DocQualityTrainer, DocGeneratorTrainer)
   optimizer_type: miprov2                               # The type of optimizer to use (miprov2, BootstrapFewShotWithRandomSearch)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
   mlflow_model_name: doc_generator_model                # The name of the model in MLflow
   mlflow_experiment_name: doc_generator_experiment      # The name of the experiment in MLflow
 
 optimizer: 
   optimizer_type: miprov2                               # BootstrapFewShotWithRandomSearch, miprov2
   auto: light                                           # miprov2 setting
-  # student: this is the prompt.infer object
-  # trainset: this is the dataset we are working with 
-  max_labeled_demos: 2
-  max_bootstrapped_demos: 4
-  num_trials: 2
+  max_labeled_demos: 2                                  # max number of labeled demonstrations
+  max_bootstrapped_demos: 4                             # max number of bootstrapped demonstrations
+  num_trials: 2                                         # number of trials
   minibatch: true                                       # default true
 
 module: 
@@ -72,7 +66,6 @@ module:
   fill_empty_descriptions: true                         # Whether to fill the empty descriptions in the schema
 
 eval:
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
   mlflow_experiment_name: doc_generator_eval            # The name of the experiment in MLflow
   generator_prediction_field: documented_schema         # The field in the generator prediction to use
   evaluator_prediction_field: rating                    # The field in the evaluator prediction to use

@@ -1,17 +1,14 @@
 graphdoc: 
-  log_level: INFO                                        # The log level to use (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
-  mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
-  mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
+  log_level: INFO                                         # The log level to use (DEBUG, INFO, WARNING, ERROR, CRITICAL)
 
 mlflow: 
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
+  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI           # The tracking URI for MLflow
   mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
   mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
 language_model: 
-  model: openai/gpt-4o                          # Must be a valid dspy language model
-  api_key: !env OPENAI_API_KEY                       # Must be a valid dspy language model API key
+  model: openai/gpt-4o                                  # Must be a valid dspy language model
+  api_key: !env OPENAI_API_KEY                          # Must be a valid dspy language model API key
   cache: true                                           # Whether to cache the calls to the language model
 
 data: 
@@ -26,6 +23,7 @@ data:
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
   seed: 42                                              # The seed for the random number generator
+
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
@@ -50,16 +48,13 @@ prompt_metric:
 trainer: 
   class: DocGeneratorTrainer                            # The type of trainer to use (DocQualityTrainer, DocGeneratorTrainer)
   optimizer_type: miprov2                               # The type of optimizer to use (miprov2, BootstrapFewShotWithRandomSearch)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
   mlflow_model_name: doc_generator_model                # The name of the model in MLflow
   mlflow_experiment_name: doc_generator_experiment      # The name of the experiment in MLflow
 
 optimizer: 
   optimizer_type: miprov2                               # BootstrapFewShotWithRandomSearch, miprov2
   auto: light                                           # miprov2 setting
-  # student: this is the prompt.infer object
-  # trainset: this is the dataset we are working with 
-  max_labeled_demos: 2
-  max_bootstrapped_demos: 4
-  num_trials: 2
+  max_labeled_demos: 2                                  # max number of labeled demonstrations
+  max_bootstrapped_demos: 4                             # max number of bootstrapped demonstrations
+  num_trials: 2                                         # number of trials
   minibatch: true                                       # default true
@@ -1,8 +1,5 @@
 graphdoc: 
   log_level: INFO                                       # The log level to use (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
-  mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
-  mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
 mlflow: 
   mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
@@ -26,6 +23,7 @@ data:
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: quality                             # Type of data helper to use (quality, generation)
   seed: 42                                              # The seed for the random number generator
+
 prompt:
   prompt: doc_quality                                   # Which prompt signature to use
   class: DocQualityPrompt                               # Must be a child of SinglePrompt (we will use an enum to map this)
@@ -50,16 +48,13 @@ prompt_metric:
 trainer: 
   class: DocQualityTrainer                              # The type of trainer to use (DocQualityTrainer)
   optimizer_type: miprov2                               # The type of optimizer to use (miprov2, BootstrapFewShotWithRandomSearch)
-  mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
   mlflow_model_name: doc_quality_model                  # The name of the model in MLflow
   mlflow_experiment_name: doc_quality_experiment        # The name of the experiment in MLflow
 
 optimizer: 
   optimizer_type: miprov2                               # BootstrapFewShotWithRandomSearch, miprov2
   auto: light                                           # miprov2 setting
-  # student: this is the prompt.infer object
-  # trainset: this is the dataset we are working with 
-  max_labeled_demos: 2
-  max_bootstrapped_demos: 4
-  num_trials: 2
+  max_labeled_demos: 2                                  # max number of labeled demonstrations
+  max_bootstrapped_demos: 4                             # max number of bootstrapped demonstrations
+  num_trials: 2                                         # number of trials
   minibatch: true                                       # default true
@@ -0,0 +1,8 @@
+graphdoc.modules.token\_tracker module
+======================================
+
+.. automodule:: graphdoc.modules.token_tracker
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :noindex:
@@ -26,6 +26,5 @@ Indices and tables
 ==================
 
 * :ref:`genindex`
-* :ref:`modindex`
 * :ref:`search`
 
@@ -116,7 +116,7 @@ def mlflow_data_helper_from_yaml(yaml_path: Union[str, Path]) -> MlflowDataHelpe
     .. code-block:: yaml
 
         mlflow:
-            mlflow_tracking_uri: !env MLFLOW_TRACKING_URI           # The tracking URI for MLflow
+            mlflow_tracking_uri:      !env MLFLOW_TRACKING_URI      # The tracking URI for MLflow
             mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
             mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
@@ -429,11 +429,15 @@ def single_trainer_from_dict(
     .. code-block:: python
 
         {
+            "mlflow": {
+                "mlflow_tracking_uri": "http://localhost:5000",
+                "mlflow_tracking_username": "admin",
+                "mlflow_tracking_password": "password",
+            },
             "trainer": {
                 "class": "DocQualityTrainer",
                 "mlflow_model_name": "doc_quality_model",
                 "mlflow_experiment_name": "doc_quality_experiment",
-                "mlflow_tracking_uri": "http://localhost:5000"
             },
             "optimizer": {
                 "optimizer_type": "miprov2",
@@ -465,7 +469,7 @@ def single_trainer_from_dict(
             optimizer_kwargs=trainer_dict["optimizer"],
             mlflow_model_name=trainer_dict["trainer"]["mlflow_model_name"],
             mlflow_experiment_name=trainer_dict["trainer"]["mlflow_experiment_name"],
-            mlflow_tracking_uri=trainer_dict["trainer"]["mlflow_tracking_uri"],
+            mlflow_tracking_uri=trainer_dict["mlflow"]["mlflow_tracking_uri"],
             trainset=trainset,
             evalset=evalset,
         )
@@ -631,7 +635,7 @@ def doc_generator_eval_from_yaml(yaml_path: Union[str, Path]) -> DocGeneratorEva
     .. code-block:: yaml
 
         mlflow:
-            mlflow_tracking_uri: !env MLFLOW_TRACKING_URI           # The tracking URI for MLflow
+            mlflow_tracking_uri:      !env MLFLOW_TRACKING_URI      # The tracking URI for MLflow
             mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server
             mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server
 
@@ -663,7 +667,6 @@ def doc_generator_eval_from_yaml(yaml_path: Union[str, Path]) -> DocGeneratorEva
             fill_empty_descriptions: true                         # Whether to fill the empty descriptions in the schema
 
         eval:
-            mlflow_tracking_uri: !env MLFLOW_TRACKING_URI         # The tracking URI for MLflow
             mlflow_experiment_name: doc_generator_eval            # The name of the experiment in MLflow
             generator_prediction_field: documented_schema         # The field in the generator prediction to use
             evaluator_prediction_field: rating                    # The field in the evaluator prediction to use

@@ -30,9 +30,9 @@ class DocQualitySignature(dspy.Signature):
     """  # noqa: B950
 
     database_schema: str = dspy.InputField()
-    category: Literal[
-        "perfect", "almost perfect", "poor but correct", "incorrect"
-    ] = dspy.OutputField()
+    category: Literal["perfect", "almost perfect", "poor but correct", "incorrect"] = (
+        dspy.OutputField()
+    )
     rating: Literal[4, 3, 2, 1] = dspy.OutputField()
 
 
@@ -69,9 +69,9 @@ class DocQualityDemonstrationSignature(dspy.Signature):
     """  # noqa: B950
 
     database_schema: str = dspy.InputField()
-    category: Literal[
-        "perfect", "almost perfect", "poor but correct", "incorrect"
-    ] = dspy.OutputField()
+    category: Literal["perfect", "almost perfect", "poor but correct", "incorrect"] = (
+        dspy.OutputField()
+    )
     rating: Literal[4, 3, 2, 1] = dspy.OutputField()