Merge pull request #21 from semiotic-ai/random

denverbaumgartner · web-flow · commit 75a449fc7c63 · 2025-03-10T18:59:51.000-05:00
Random
diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml
@@ -25,6 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
+  seed: 42                                              # The seed for the random number generator
 
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: quality                             # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: doc_quality                                   # Which prompt signature to use
   class: DocQualityPrompt                               # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/graphdoc/config.py b/graphdoc/graphdoc/config.py
@@ -29,10 +29,6 @@
 # logging
 log = logging.getLogger(__name__)
 
-# global variables
-random.seed(42)
-
-
 #######################
 # Resource Setup      #
 #######################
@@ -160,7 +156,9 @@ def trainset_from_yaml(yaml_path: Union[str, Path]) -> List[dspy.Example]:
 
 
 def split_trainset(
-    trainset: List[dspy.Example], evalset_ratio: float
+    trainset: List[dspy.Example],
+    evalset_ratio: float,
+    seed: int = 42,
 ) -> tuple[List[dspy.Example], List[dspy.Example]]:
     """Split a trainset into a trainset and evalset.
 
@@ -170,6 +168,7 @@ def split_trainset(
     tuple[List[dspy.Example], List[dspy.Example]]
 
     """
+    random.seed(seed)
     split_idx = int(len(trainset) * (1 - evalset_ratio))
     random.shuffle(trainset)
     evalset = trainset[split_idx:]
@@ -201,6 +200,7 @@ def trainset_and_evalset_from_yaml(
         evalset_ratio: 0.1,                     # The proportionate size of evalset
         data_helper_type: quality               # Type of data helper to use
                                                 # (quality, generation)
+        seed: 42                                # Seed used to shuffle before the split
 
     :param yaml_path: Path to the YAML file.
     :type yaml_path: Union[str, Path]
@@ -210,7 +210,9 @@ def trainset_and_evalset_from_yaml(
     """
     config = load_yaml_config(yaml_path)
     trainset = trainset_from_dict(config["data"])
-    return split_trainset(trainset, config["data"]["evalset_ratio"])
+    return split_trainset(
+        trainset, config["data"]["evalset_ratio"], config["data"]["seed"]
+    )
 
 
 #######################
diff --git a/graphdoc/graphdoc/main.py b/graphdoc/graphdoc/main.py
@@ -3,7 +3,6 @@
 
 import argparse
 import logging
-import random
 
 # system packages
 import sys
@@ -20,9 +19,6 @@
 # logging
 log = logging.getLogger(__name__)
 
-# global variables
-random.seed(42)
-
 #######################
 # Main Entry Point    #
 #######################
diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 10                                     # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml
@@ -25,7 +25,7 @@ data:
   trainset_size: 1000                                   # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: generation                          # Type of data helper to use (quality, generation)
-
+  seed: 42                                              # The seed for the random number generator
 prompt:
   prompt: base_doc_gen                                  # Which prompt signature to use
   class: DocGeneratorPrompt                             # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml
@@ -25,6 +25,7 @@ data:
   trainset_size: 1000                                   # The size of the trainset
   evalset_ratio: 0.1                                    # The proportionate size of the evalset
   data_helper_type: quality                             # Type of data helper to use (quality, generation)
+  seed: 42                                              # The seed for the random number generator
 
 prompt:
   prompt: doc_quality                                   # Which prompt signature to use
diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml
@@ -15,6 +15,7 @@ data:
   load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
   local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
   local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
+  seed: 42 # The seed for the random number generator
 
 prompt:
   prompt: zero_shot_doc_gen # Which prompt signature to use
diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml
@@ -15,6 +15,7 @@ data:
   load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
   local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
   local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
+  seed: 42 # The seed for the random number generator
 
 prompt:
   prompt: zero_shot_doc_gen # Which prompt signature to use
diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml
@@ -15,7 +15,7 @@ data:
   load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
   local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
   local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
-
+  seed: 42 # The seed for the random number generator
 prompt:
   prompt: doc_quality # Which prompt signature to use
   class: SchemaDocQualityPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml
@@ -15,6 +15,7 @@ data:
   load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
   local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
   local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
+  seed: 42 # The seed for the random number generator
 
 prompt:
   prompt: doc_quality # Which prompt signature to use
diff --git a/graphdoc/tests/assets/configs/single_prompt_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_trainer.yaml
@@ -15,6 +15,7 @@ data:
   load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
   local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
   local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
+  seed: 42 # The seed for the random number generator
 
 prompt:
   prompt: doc_quality # Which prompt signature to use