2 files changed, 7 insertions(+), 7 deletions(-).

The first file renames the local variable `per_gpu` to `per_device` inside `get_balanced_memory`:

@@ -213,7 +213,7 @@ def get_balanced_memory(
                     break  # only one device
 
     module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes)
-    per_gpu = module_sizes[""] // (num_devices - 1 if low_zero else num_devices)
+    per_device = module_sizes[""] // (num_devices - 1 if low_zero else num_devices)
 
     # We can't just set the memory to model_size // num_devices as it will end being too small: each GPU will get
     # slightly less layers and some layers will end up offload at the end. So this function computes a buffer size to
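As context for the rename: at this point `per_device` is just the model footprint split across the devices that will hold weights, with device 0 skipped when `low_zero=True`. A minimal sketch of that division, using invented byte counts rather than real `compute_module_sizes` output:

```python
# Invented numbers purely for illustration.
total_size = 14_000_000_000  # stands in for module_sizes[""], the whole-model size in bytes
num_devices = 4
low_zero = True              # keep device 0 as empty as possible

# Same arithmetic as the renamed line above.
per_device = total_size // (num_devices - 1 if low_zero else num_devices)
print(per_device)  # 4666666666 -> the base share for each of devices 1..3
```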
@@ -251,7 +251,7 @@ def get_balanced_memory(
     leaves = get_module_leaves(module_sizes)
     mean_leaves = int(sum(module_sizes[n] for n in leaves) / max(len(leaves), 1))
     buffer = int(1.25 * max(buffer, mean_leaves))
-    per_gpu += buffer
+    per_device += buffer
 
     # Sorted list of GPUs id (we may have some gpu ids not included in the our max_memory list - let's ignore them)
     gpus_idx_list = list(
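The buffer added to `per_device` here is at least 1.25x the mean leaf-module size, which compensates for the uneven layer split described in the comment above. A rough sketch of that step, with invented leaf sizes standing in for the `get_module_leaves`/`module_sizes` values:

```python
# Invented leaf-module sizes in bytes.
leaf_sizes = [120_000_000, 80_000_000, 100_000_000]
buffer = 0  # the real function may already hold a larger value from no-split modules

mean_leaves = int(sum(leaf_sizes) / max(len(leaf_sizes), 1))  # 100_000_000
buffer = int(1.25 * max(buffer, mean_leaves))                 # 125_000_000
per_device = 4_666_666_666 + buffer                           # continuing the previous sketch
```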
@@ -261,7 +261,7 @@ def get_balanced_memory(
     )
     # The last device is left with max_memory just in case the buffer is not enough.
     for idx in gpus_idx_list[:-1]:
-        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])
+        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_device, max_memory[idx])
 
     if low_zero:
         min_zero = max(0, module_sizes[""] - sum(max_memory[i] for i in range(1, num_devices)))
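Taken together, every device except the last is capped at `per_device` (device 0 keeps its own, smaller cap when `low_zero` is set), and the last device keeps its full budget as a safety margin. A standalone sketch of that capping loop, with hypothetical per-device limits:

```python
# Hypothetical memory budgets per device id, in bytes.
max_memory = {0: 2_000_000_000, 1: 16_000_000_000, 2: 16_000_000_000, 3: 16_000_000_000}
per_device = 4_791_666_666  # base share plus buffer, continuing the sketches above
low_zero = True

gpus_idx_list = sorted(max_memory)  # simplified stand-in for the filtered, sorted id list
for idx in gpus_idx_list[:-1]:      # the last device is deliberately left at its full budget
    max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_device, max_memory[idx])

print(max_memory)
# {0: 2000000000, 1: 4791666666, 2: 4791666666, 3: 16000000000}
```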
The second changed file updates the matching names inside TrainingArguments:

@@ -1164,8 +1164,8 @@ def __str__(self):
 
         # Remove deprecated arguments. That code should be removed once
         # those deprecated arguments are removed from TrainingArguments. (TODO: v5)
-        del self_as_dict["per_gpu_train_batch_size"]
-        del self_as_dict["per_gpu_eval_batch_size"]
+        del self_as_dict["per_device_train_batch_size"]
+        del self_as_dict["per_device_eval_batch_size"]
 
         self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()}
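For readers unfamiliar with the surrounding method: `__str__` works on a dict copy of the arguments, drops entries it no longer wants to display, and masks any value whose key ends in `_token`. A small self-contained sketch of that pattern (not the actual TrainingArguments code):

```python
args_as_dict = {"per_device_train_batch_size": 8, "hub_token": "secret", "output_dir": "out"}

# Drop an entry that should not appear in the printed representation.
del args_as_dict["per_device_train_batch_size"]

# Mask token-like values so secrets never end up in logs.
args_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in args_as_dict.items()}
print(args_as_dict)  # {'hub_token': '<HUB_TOKEN>', 'output_dir': 'out'}
```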
@@ -1193,7 +1193,7 @@ def n_device(self):
     @property
     def train_batch_size(self) -> int:
         """
-        The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
+        The actual batch size for training (may differ from `per_device_train_batch_size` in distributed training).
         """
         per_device_batch_size = self.per_device_train_batch_size
         train_batch_size = per_device_batch_size * max(1, self.n_device)
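The docstring change does not touch the arithmetic: the effective training batch size is still the per-device value times the number of devices, with a floor of one for CPU-only runs. For example, under an assumed 4-device setup:

```python
per_device_train_batch_size = 8
n_device = 4  # may be 0 on a CPU-only machine, hence the max(1, ...) guard

train_batch_size = per_device_train_batch_size * max(1, n_device)
print(train_batch_size)  # 32 samples per training step across all devices
```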
@@ -1202,7 +1202,7 @@ def train_batch_size(self) -> int:
     @property
     def eval_batch_size(self) -> int:
         """
-        The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
+        The actual batch size for evaluation (may differ from `per_device_eval_batch_size` in distributed training).
         """
         per_device_batch_size = self.per_device_eval_batch_size
         eval_batch_size = per_device_batch_size * max(1, self.n_device)
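`eval_batch_size` follows the same rule using `per_device_eval_batch_size`, so evaluation can run a larger per-device batch (no gradients or optimizer state to hold) without affecting training. Continuing the assumed 4-device example:

```python
per_device_eval_batch_size = 16
n_device = 4

eval_batch_size = per_device_eval_batch_size * max(1, n_device)
print(eval_batch_size)  # 64 samples per evaluation step across all devices
```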