
Commit bb8dab0

nina-xu authored and HenriqueTolentino committed
nit fixes to workflow blueprints (#585)
* nit: make spacing consistent
* fix num_records for one blueprint
* change hyphen to underscore in task names
1 parent a9d5a33 · commit bb8dab0

14 files changed: +358, -363 lines
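The last item in the description, changing hyphens to underscores in task names, amounts to renames of the following shape (a hypothetical before/after; the specific tasks renamed are not shown in this excerpt):

# before (hypothetical)
task:
  name: tabular-ft

# after
task:
  name: tabular_ft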

config_templates/gretel/tasks/tabular_ft__default.yaml

Lines changed: 31 additions & 33 deletions
@@ -3,40 +3,38 @@ name: default

Only the indentation of this block changes; the parameters and comments are untouched. The updated template reads:

task:
  name: tabular_ft
  config:
    train:
      # Optionally group records by the column(s) set below.
      # This is useful if you need to maintain correlations
      # across multiple records. Otherwise, the training
      # assumes records are independent.
      group_training_examples_by: null

      # Optionally order records by the column set below.
      # This is useful if your records are sequential.
      # Note that this parameter can only be used when
      # your records are grouped using the above parameter.
      order_training_examples_by: null

      params:
        # The parameter below is a proxy for training time.
        # If set to 'auto', we will automatically choose an
        # appropriate value. An integer value will set the
        # number of records from the input dataset that the
        # model will see during training. It can be smaller
        # (we downsample), larger (we resample), or the same
        # size as your input dataset. A starting value to
        # experiment with is 25,000.
        num_input_records_to_sample: auto

        # Scale the base LLM's context length by this factor
        # using RoPE scaling to handle datasets with more
        # columns, or datasets containing groups with more
        # than a few records. You can try increasing the
        # rope_scaling_factor (you could first try the value 2)
        # if you hit an error for maximum tokens. It must be
        # an integer value. The default is 1 and maximum is 6.
        rope_scaling_factor: 1

    generate:
      num_records: 1000
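As a usage sketch (not part of this commit), the grouping and ordering parameters above matter when records form sequences. The column names patient_id and visit_date, and the concrete parameter values, are assumptions chosen only to illustrate the knobs this blueprint exposes:

name: default
task:
  name: tabular_ft
  config:
    train:
      # Hypothetical: keep every row belonging to the same patient together,
      # so cross-record correlations are preserved during training.
      group_training_examples_by: patient_id
      # Hypothetical: order each patient's rows by visit date; ordering is
      # only allowed when grouping (above) is set.
      order_training_examples_by: visit_date
      params:
        # Fixed training budget instead of 'auto' (the template suggests
        # 25,000 as a starting point to experiment with).
        num_input_records_to_sample: 25000
        # Groups span several rows, so double the context window via RoPE
        # scaling (integer, default 1, maximum 6).
        rope_scaling_factor: 2
    generate:
      num_records: 1000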

config_templates/gretel/tasks/tabular_ft__differential_privacy.yaml

Lines changed: 50 additions & 52 deletions
@@ -3,55 +3,53 @@ name: differential_privacy

As with the default blueprint, only the indentation changes here. The updated template reads:

task:
  name: tabular_ft
  config:
    train:
      # Optionally group records by the column(s) set below.
      # This is useful if you need to maintain correlations
      # across multiple records. Otherwise, the training
      # assumes records are independent.
      group_training_examples_by: null

      # Optionally order records by the column set below.
      # This is useful if your records are sequential.
      # Note that this parameter can only be used when
      # your records are grouped using the above parameter.
      order_training_examples_by: null

      privacy_params:
        dp: true

        # Defines the privacy budget - the larger the value, the
        # less privacy we get. A value between 2 and 8 is deemed
        # reasonable, usually.
        epsilon: 8

      params:
        # The parameter below is a proxy for training time.
        # If set to 'auto', we will automatically choose an
        # appropriate value. An integer value will set the
        # number of records from the input dataset that the
        # model will see during training. It can be smaller
        # (we downsample), larger (we resample), or the same
        # size as your input dataset. A starting value to
        # experiment with is 25,000.
        num_input_records_to_sample: auto

        # Scale the base LLM's context length by this factor
        # using RoPE scaling to handle datasets with more
        # columns, or datasets containing groups with more
        # than a few records. You can try increasing the
        # rope_scaling_factor (you could first try the value 2)
        # if you hit an error for maximum tokens. It must be
        # an integer value. The default is 1 and maximum is 6.
        rope_scaling_factor: 1

        # You can try increasing this until you run out-of-memory.
        batch_size: 4

    generate:
      num_records: 1000

      # With DP, enabling structured generation can help with
      # increasing the percentage of valid records.
      use_structured_generation: true
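Similarly, a hypothetical variant of the differential-privacy blueprint with a tighter budget might look like the sketch below; the epsilon and batch_size values are illustrative only and are not part of this commit:

name: differential_privacy
task:
  name: tabular_ft
  config:
    train:
      group_training_examples_by: null
      order_training_examples_by: null
      privacy_params:
        dp: true
        # Hypothetical: a smaller epsilon buys stronger privacy at some cost
        # to fidelity; the template calls 2-8 a reasonable range.
        epsilon: 4
      params:
        num_input_records_to_sample: auto
        rope_scaling_factor: 1
        # Hypothetical: raised from 4; the template suggests increasing this
        # until you run out of memory.
        batch_size: 8
    generate:
      num_records: 1000
      # Kept on, since structured generation helps more records come out
      # valid when training with DP.
      use_structured_generation: true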
