
Commit 295d109

Authored by HenriqueTolentino, yamini, alexahaushalter, kirit93, and nina-xu

GA Blueprint Release (#574)

* Update blueprints for launch (#553)
* add evaluate step
* update blueprints for GA
* new blueprint cards for DD & SS
  - Pulled in notebooks and new structure for DD notebooks from main
  - Added new blueprint cards
    -- DD multi-turn, rag-eval, text-to-code
    -- SS hipaa and gdpr
  - Added details pages
  - Added sample datasets for SS
* fixed details file link and added placeholder details for transform
* updates to links, new transform details
* update copy
* minor changes
* tweaks
* still tweaking copy
* copy changes
* Navigator removal & Transform config updates (#558)
* Add otnotes5 data sample and preview (#562)
* Lots of blueprint updates for GA & v2 (#561)
* Many blueprint updates for GA
* Adding rope_scaling_factor to the default config
* Fixing where rope_scaling_factor goes
* Updating .md files to remove outdated info & v1 notebook links
* Reducing tabFT num records to 1000 default for faster runs
* Removing old images & icons
* Fixing name of file
* Trying to get rid of errors
* underscore to hyphen typo
* Changing model blueprints to use a workflow config (#563)
* Changing model blueprints to use a workflow config
* Updating to use new evaluate task name
* Updating task names (#564)
* updating task names
* Swapping order for consistency
* Fixing hyphen vs underscore typo
* Updated dd image for blueprint card (#569)
* Updated dd image
* Updated blueprint card to add 101 notebook
* Updated gretel.json

---------

Co-authored-by: Kirit93 <kthadaka@nvidia.com>

* Updated notebook links (#572)

Co-authored-by: Kirit93 <kthadaka@nvidia.com>

* Fixing small typo (#573)
* Updating num_records to be 1000 across the board (#575)
* Update gretel.json with DD notebook link (#580)
* Update gretel.json (#583)
* nit fixes to workflow blueprints (#585)
* nit: make spacing consistent
* fix num_records for one blueprint
* change hyphen to underscore in task names
* add collab links pointing to main and 101 notebook (#587)
* Update workflow tests
* Improve tests
* Ensure we catch errors that are returned with 200
* Also log in "200 with error" responses

---------

Co-authored-by: Yamini <yamini@users.noreply.github.com>
Co-authored-by: alexahaushalter <alexahaushalter@hotmail.com>
Co-authored-by: Kirit Thadaka <kirit.thadaka@gmail.com>
Co-authored-by: Kirit93 <kthadaka@nvidia.com>
Co-authored-by: Nina Xu <nina.ning.xu@gmail.com>
Co-authored-by: Matt Kornfield <mckornfield@gmail.com>
Co-authored-by: Henrique Tolentino <htolentino@nvidia.com>
1 parent 73bfa97 commit 295d109


56 files changed: +9084 / -462 lines

config_templates/gretel/synthetics/navigator-ft-differential-privacy.yml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ models:
       order_training_examples_by: null

      generate:
-        num_records: 5000
+        num_records: 1000

        # With DP, enabling structured generation can help with
        # increasing the percentage of valid records.

config_templates/gretel/synthetics/navigator-ft.yml

Lines changed: 11 additions & 2 deletions
@@ -17,7 +17,7 @@ models:
       order_training_examples_by: null

      generate:
-        num_records: 5000
+        num_records: 1000

      params:
        # The parameter below is a proxy for training time.
@@ -28,4 +28,13 @@ models:
        # (we downsample), larger (we resample), or the same
        # size as your input dataset. A starting value to
        # experiment with is 25,000.
-        num_input_records_to_sample: auto
+        num_input_records_to_sample: auto
+
+        # Scale the base LLM's context length by this factor
+        # using RoPE scaling to handle datasets with more
+        # columns, or datasets containing groups with more
+        # than a few records. You can try increasing the
+        # rope_scaling_factor (you could first try the value 2)
+        # if you hit an error for maximum tokens. It must be
+        # an integer value. The default is 1 and maximum is 6.
+        rope_scaling_factor: 1
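Assembled from the two hunks above, the updated generation and sampling settings in navigator-ft.yml now read roughly as follows. This is a condensed sketch: the long explanatory comments are shortened, and the surrounding model keys and exact indentation are assumed rather than copied from the file.

      generate:
        num_records: 1000                  # reduced from 5000 for faster default runs

      params:
        num_input_records_to_sample: auto  # proxy for training time; try 25,000 as a starting point
        # Integer, default 1, maximum 6; try 2 first if generation hits a
        # maximum-token error on wide datasets or large record groups.
        rope_scaling_factor: 1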

config_templates/gretel/tasks/tabular_ft__default.yaml

Lines changed: 31 additions & 24 deletions
@@ -3,31 +3,38 @@ name: default
 task:
   name: tabular_ft
   config:
-    train:
-      # Optionally group records by the column(s) set below.
-      # This is useful if you need to maintain correlations
-      # across multiple records. Otherwise, the training
-      # assumes records are independent.
-      group_training_examples_by: null
+    train:
+      # Optionally group records by the column(s) set below.
+      # This is useful if you need to maintain correlations
+      # across multiple records. Otherwise, the training
+      # assumes records are independent.
+      group_training_examples_by: null

-      # Optionally order records by the column set below.
-      # This is useful if your records are sequential.
-      # Note that this parameter can only be used when
-      # your records are grouped using the above parameter.
-      order_training_examples_by: null
+      # Optionally order records by the column set below.
+      # This is useful if your records are sequential.
+      # Note that this parameter can only be used when
+      # your records are grouped using the above parameter.
+      order_training_examples_by: null

-    params:
-      # The parameter below is a proxy for training time.
-      # If set to 'auto', we will automatically choose an
-      # appropriate value. An integer value will set the
-      # number of records from the input dataset that the
-      # model will see during training. It can be smaller
-      # (we downsample), larger (we resample), or the same
-      # size as your input dataset. A starting value to
-      # experiment with is 25,000.
-      num_input_records_to_sample: auto
-
-    generate:
-      num_records: 5000
+    params:
+      # The parameter below is a proxy for training time.
+      # If set to 'auto', we will automatically choose an
+      # appropriate value. An integer value will set the
+      # number of records from the input dataset that the
+      # model will see during training. It can be smaller
+      # (we downsample), larger (we resample), or the same
+      # size as your input dataset. A starting value to
+      # experiment with is 25,000.
+      num_input_records_to_sample: auto

+      # Scale the base LLM's context length by this factor
+      # using RoPE scaling to handle datasets with more
+      # columns, or datasets containing groups with more
+      # than a few records. You can try increasing the
+      # rope_scaling_factor (you could first try the value 2)
+      # if you hit an error for maximum tokens. It must be
+      # an integer value. The default is 1 and maximum is 6.
+      rope_scaling_factor: 1

+    generate:
+      num_records: 1000
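Stitching the added lines together, the tabular_ft default task template ends up roughly as below. A condensed sketch: the schema_version line and the full comments are omitted, and the indentation is approximate.

    name: default
    task:
      name: tabular_ft
      config:
        train:
          group_training_examples_by: null   # optional grouping to preserve cross-record correlations
          order_training_examples_by: null   # optional ordering; requires grouping above
        params:
          num_input_records_to_sample: auto  # proxy for training time; try 25,000 as a starting point
          rope_scaling_factor: 1             # integer, 1 (default) to 6; raise on maximum-token errors
        generate:
          num_records: 1000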

config_templates/gretel/tasks/tabular_ft__differential_privacy.yaml

Lines changed: 50 additions & 43 deletions
@@ -3,46 +3,53 @@ name: differential_privacy
 task:
   name: tabular_ft
   config:
-    train:
-      # Optionally group records by the column(s) set below.
-      # This is useful if you need to maintain correlations
-      # across multiple records. Otherwise, the training
-      # assumes records are independent.
-      group_training_examples_by: null
-
-      # Optionally order records by the column set below.
-      # This is useful if your records are sequential.
-      # Note that this parameter can only be used when
-      # your records are grouped using the above parameter.
-      order_training_examples_by: null
-
-    privacy_params:
-      dp: true
-
-      # Defines the privacy budget - the larger the value, the
-      # less privacy we get. A value between 2 and 8 is deemed
-      # reasonable, usually.
-      epsilon: 8
-
-    params:
-      # The parameter below is a proxy for training time.
-      # If set to 'auto', we will automatically choose an
-      # appropriate value. An integer value will set the
-      # number of records from the input dataset that the
-      # model will see during training. It can be smaller
-      # (we downsample), larger (we resample), or the same
-      # size as your input dataset. A starting value to
-      # experiment with is 25,000.
-      num_input_records_to_sample: auto
-
-      # You can try increasing this until you run out-of-memory.
-      batch_size: 4
-
-    generate:
-      num_records: 5000
-
-      # With DP, enabling structured generation can help with
-      # increasing the percentage of valid records.
-      use_structured_generation: true
-
-
+    train:
+      # Optionally group records by the column(s) set below.
+      # This is useful if you need to maintain correlations
+      # across multiple records. Otherwise, the training
+      # assumes records are independent.
+      group_training_examples_by: null
+
+      # Optionally order records by the column set below.
+      # This is useful if your records are sequential.
+      # Note that this parameter can only be used when
+      # your records are grouped using the above parameter.
+      order_training_examples_by: null
+
+    privacy_params:
+      dp: true
+
+      # Defines the privacy budget - the larger the value, the
+      # less privacy we get. A value between 2 and 8 is deemed
+      # reasonable, usually.
+      epsilon: 8
+
+    params:
+      # The parameter below is a proxy for training time.
+      # If set to 'auto', we will automatically choose an
+      # appropriate value. An integer value will set the
+      # number of records from the input dataset that the
+      # model will see during training. It can be smaller
+      # (we downsample), larger (we resample), or the same
+      # size as your input dataset. A starting value to
+      # experiment with is 25,000.
+      num_input_records_to_sample: auto
+
+      # Scale the base LLM's context length by this factor
+      # using RoPE scaling to handle datasets with more
+      # columns, or datasets containing groups with more
+      # than a few records. You can try increasing the
+      # rope_scaling_factor (you could first try the value 2)
+      # if you hit an error for maximum tokens. It must be
+      # an integer value. The default is 1 and maximum is 6.
+      rope_scaling_factor: 1
+
+      # You can try increasing this until you run out-of-memory.
+      batch_size: 4
+
+    generate:
+      num_records: 1000
+
+      # With DP, enabling structured generation can help with
+      # increasing the percentage of valid records.
+      use_structured_generation: true
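The differential-privacy variant follows the same layout, adding the DP-specific knobs next to the new rope_scaling_factor. Condensed from the hunk above, with comments shortened and indentation approximate:

        privacy_params:
          dp: true
          epsilon: 8                        # privacy budget; values between 2 and 8 are usually reasonable
        params:
          num_input_records_to_sample: auto
          rope_scaling_factor: 1            # integer, 1 (default) to 6
          batch_size: 4                     # can be increased until you run out of memory
        generate:
          num_records: 1000
          use_structured_generation: true   # helps increase the share of valid records under DP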

config_templates/gretel/tasks/tabular_gan__default.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ task:
       batch_size: auto
       auto_transform_datetimes: False
     generate:
-      num_records: 5000
+      num_records: 1000

config_templates/gretel/tasks/text_ft__default.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,5 +13,5 @@ task:
       lr_scheduler: "linear"
       learning_rate: 0.0001
     generate:
-      num_records: 80
+      num_records: 1000
       maximum_text_length: 100

config_templates/gretel/tasks/text_ft__differential_privacy.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 schema_version: "1.0"
-name: default
+name: differential_privacy
 task:
   name: text_ft
   config:
@@ -23,5 +23,5 @@ task:
       epsilon: 5 # Privacy budget (lower values = stronger privacy)
       delta: auto # Probability of privacy leakage (auto-calculated)
     generate:
-      num_records: 80 # Number of records to generate
+      num_records: 1000 # Number of records to generate
       maximum_text_length: 128 # Maximum length of generated texts in tokens
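After this change, the text_ft differential-privacy template carries the corrected name and the larger generation default. A sketch pieced together from the two hunks above; the sections between config and the privacy settings are not shown in the diff, so their enclosing keys are omitted here.

    schema_version: "1.0"
    name: differential_privacy
    task:
      name: text_ft
      config:
        # ... training settings and enclosing privacy section unchanged ...
        epsilon: 5      # Privacy budget (lower values = stronger privacy)
        delta: auto     # Probability of privacy leakage (auto-calculated)
        generate:
          num_records: 1000            # Number of records to generate
          maximum_text_length: 128     # Maximum length of generated texts in tokens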
