name: differential_privacy
task:
  name: tabular_ft
  config:
    train:
      # Optionally group records by the column(s) set below.
      # This is useful if you need to maintain correlations
      # across multiple records. Otherwise, the training
      # assumes records are independent.
      group_training_examples_by: null
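      # For example, to keep all records for the same user
      # together (assuming a hypothetical user_id column):
      # group_training_examples_by: user_id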

      # Optionally order records by the column set below.
      # This is useful if your records are sequential.
      # Note that this parameter can only be used when your
      # records are grouped using the parameter above.
      order_training_examples_by: null
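      # For example, to keep each group's records in
      # chronological order (assuming a hypothetical
      # event_time column):
      # order_training_examples_by: event_time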

      privacy_params:
        dp: true

        # Defines the privacy budget: the larger the value,
        # the weaker the privacy guarantee. Values between
        # 2 and 8 are usually considered reasonable.
        epsilon: 8
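        # For example, for a stronger privacy guarantee you
        # could lower the budget, at some cost in data quality:
        # epsilon: 2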

      params:
        # The parameter below is a proxy for training time.
        # If set to 'auto', we will automatically choose an
        # appropriate value. An integer value sets the number
        # of records from the input dataset that the model
        # will see during training. It can be smaller than
        # your input dataset (we downsample), larger (we
        # resample), or the same size. A starting value to
        # experiment with is 25,000.
        num_input_records_to_sample: auto
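        # For example, to use the suggested starting point
        # instead of automatic selection:
        # num_input_records_to_sample: 25000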

        # Scale the base LLM's context length by this factor
        # using RoPE scaling to handle datasets with more
        # columns, or datasets containing groups with more
        # than a few records. If you hit a maximum-token
        # error, try increasing rope_scaling_factor (start
        # with 2). It must be an integer value. The default
        # is 1 and the maximum is 6.
        rope_scaling_factor: 1
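        # For example, if training fails with a maximum-token
        # error, a doubled context length would look like:
        # rope_scaling_factor: 2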

        # You can try increasing this until you run out of memory.
        batch_size: 4
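        # For example, a first increase to try, assuming your
        # GPU memory allows it:
        # batch_size: 8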

    generate:
      num_records: 1000

      # With DP, enabling structured generation can help
      # increase the percentage of valid records.
      use_structured_generation: true
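      # If you train without DP, you could consider
      # disabling it:
      # use_structured_generation: false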