Skip to content

Commit 331148d

Browse files
authored
Enable distributed LoRA training (ml-explore#821)
1 parent 29c954f commit 331148d

File tree

2 files changed

+86
-46
lines changed

2 files changed

+86
-46
lines changed

llms/mlx_lm/tuner/trainer.py

Lines changed: 55 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import mlx.core as mx
1111
import mlx.nn as nn
1212
import numpy as np
13+
from mlx.nn.utils import average_gradients
1314
from mlx.utils import tree_flatten
1415

1516

@@ -84,9 +85,16 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
8485
f" examples but only has {len(dataset)}."
8586
)
8687

88+
# If running in distributed mode (N machines) then each one should skip N-1
89+
# samples
90+
step = mx.distributed.init().size()
91+
if batch_size % step != 0:
92+
raise ValueError("The batch size must be divisible by the number of workers")
93+
8794
# Make the batches:
8895
batch_idx = [
89-
idx[i : i + batch_size] for i in range(0, len(idx) - batch_size + 1, batch_size)
96+
idx[i : i + batch_size : step]
97+
for i in range(0, len(idx) - batch_size + 1, batch_size)
9098
]
9199

92100
while True:
@@ -112,9 +120,9 @@ def iterate_batches(dataset, tokenizer, batch_size, max_seq_length, train=False)
112120
max_length_in_batch = pad_to * ((max(lengths) + pad_to - 1) // pad_to)
113121
max_length_in_batch = min(max_length_in_batch, max_seq_length)
114122

115-
batch_arr = np.zeros((batch_size, max_length_in_batch), np.int32)
123+
batch_arr = np.zeros((batch_size // step, max_length_in_batch), np.int32)
116124

117-
for j in range(batch_size):
125+
for j in range(batch_size // step):
118126
truncated_length = min(lengths[j], max_seq_length)
119127
batch_arr[j, :truncated_length] = batch[j][:truncated_length]
120128
lengths[j] = (
@@ -138,7 +146,7 @@ def evaluate(
138146
loss: callable = default_loss,
139147
iterate_batches: callable = iterate_batches,
140148
):
141-
all_losses = []
149+
all_losses = 0
142150
ntokens = 0
143151

144152
index_iterator = iter(range(num_batches)) if num_batches != -1 else iter(int, 1)
@@ -153,10 +161,14 @@ def evaluate(
153161
),
154162
):
155163
losses, toks = loss(model, *batch)
156-
all_losses.append((losses * toks).item())
157-
ntokens += toks.item()
164+
all_losses += losses * toks
165+
ntokens += toks
166+
mx.eval(all_losses, ntokens)
167+
168+
all_losses = mx.distributed.all_sum(all_losses)
169+
ntokens = mx.distributed.all_sum(ntokens)
158170

159-
return np.sum(all_losses) / ntokens
171+
return (all_losses / ntokens).item()
160172

161173

162174
class TrainingCallback:
@@ -182,6 +194,11 @@ def train(
182194
training_callback: TrainingCallback = None,
183195
):
184196
print(f"Starting training..., iters: {args.iters}")
197+
world = mx.distributed.init()
198+
world_size = world.size()
199+
rank = world.rank()
200+
if world_size > 1:
201+
print(f"Node {rank} of {world_size}")
185202

186203
if args.grad_checkpoint:
187204
grad_checkpoint(model.layers[0])
@@ -192,15 +209,19 @@ def step(batch):
192209
# Forward and backward pass
193210
(lvalue, toks), grad = loss_value_and_grad(model, *batch)
194211

212+
# All reduce the gradients if running in distributed mode
213+
grad = average_gradients(grad)
214+
195215
# Model update
196216
optimizer.update(model, grad)
197217

198218
return lvalue, toks
199219

200220
loss_value_and_grad = nn.value_and_grad(model, loss)
201221

202-
losses = []
222+
losses = 0
203223
n_tokens = 0
224+
steps = 0
204225
trained_tokens = 0
205226
# Main training loop
206227
start = time.perf_counter()
@@ -229,9 +250,13 @@ def step(batch):
229250
iterate_batches=iterate_batches,
230251
)
231252
val_time = time.perf_counter() - stop
232-
print(
233-
f"Iter {it}: " f"Val loss {val_loss:.3f}, " f"Val took {val_time:.3f}s"
234-
)
253+
if rank == 0:
254+
print(
255+
f"Iter {it}: "
256+
f"Val loss {val_loss:.3f}, "
257+
f"Val took {val_time:.3f}s",
258+
flush=True,
259+
)
235260

236261
if training_callback is not None:
237262
val_info = {
@@ -244,30 +269,33 @@ def step(batch):
244269
start = time.perf_counter()
245270

246271
lvalue, toks = step(batch)
247-
mx.eval(state, lvalue, toks)
248-
249-
# Record loss
250-
losses.append(lvalue.item())
251-
n_tokens += toks.item()
272+
losses += lvalue
273+
n_tokens += toks
274+
steps += 1
275+
mx.eval(state, losses, n_tokens)
252276

253277
# Report training loss if needed
254278
if it % args.steps_per_report == 0 or it == args.iters:
255279
stop = time.perf_counter()
256280

257-
train_loss = np.mean(losses)
281+
train_loss = mx.distributed.all_sum(losses).item()
282+
train_loss /= steps * mx.distributed.init().size()
283+
n_tokens = mx.distributed.all_sum(n_tokens).item()
258284
learning_rate = optimizer.learning_rate.item()
259285
it_sec = args.steps_per_report / (stop - start)
260286
tokens_sec = float(n_tokens) / (stop - start)
261287
trained_tokens += n_tokens
262288
peak_mem = mx.metal.get_peak_memory() / 2**30
263-
print(
264-
f"Iter {it}: Train loss {train_loss:.3f}, "
265-
f"Learning Rate {learning_rate:.3e}, "
266-
f"It/sec {it_sec:.3f}, "
267-
f"Tokens/sec {tokens_sec:.3f}, "
268-
f"Trained Tokens {trained_tokens}, "
269-
f"Peak mem {peak_mem:.3f} GB"
270-
)
289+
if rank == 0:
290+
print(
291+
f"Iter {it}: Train loss {train_loss:.3f}, "
292+
f"Learning Rate {learning_rate:.3e}, "
293+
f"It/sec {it_sec:.3f}, "
294+
f"Tokens/sec {tokens_sec:.3f}, "
295+
f"Trained Tokens {trained_tokens}, "
296+
f"Peak mem {peak_mem:.3f} GB",
297+
flush=True,
298+
)
271299

272300
if training_callback is not None:
273301
train_info = {
@@ -281,8 +309,9 @@ def step(batch):
281309
}
282310
training_callback.on_train_loss_report(train_info)
283311

284-
losses = []
312+
losses = 0
285313
n_tokens = 0
314+
steps = 0
286315
start = time.perf_counter()
287316

288317
# Save adapter weights

llms/tests/test_finetune.py

Lines changed: 31 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import math
44
import sys
55
import unittest
6+
from contextlib import contextmanager
67
from io import StringIO
78
from unittest.mock import MagicMock
89

@@ -17,6 +18,14 @@
1718
from mlx_lm.tuner.utils import build_schedule
1819

1920

21+
@contextmanager
def swapped_with_identity(obj, func):
    """Temporarily replace ``obj.<func>`` with a function returning its first argument.

    Args:
        obj: Object (or module) that owns the attribute to replace.
        func: Name of the attribute on ``obj`` to swap out.

    Yields:
        None. While the ``with`` body runs, ``getattr(obj, func)`` is an
        identity stand-in.
    """
    old_func = getattr(obj, func)
    # Return the first argument unchanged; tolerate any extra positional or
    # keyword arguments the real callable might be given.
    setattr(obj, func, lambda x, *args, **kwargs: x)
    try:
        yield
    finally:
        # Always restore the original, even when the body raises, so a
        # failure inside the ``with`` block cannot leak the stand-in into
        # code that runs afterwards.
        setattr(obj, func, old_func)
27+
28+
2029
class TestLora(unittest.TestCase):
2130
def setUp(self):
2231
self.capturedOutput = StringIO()
@@ -374,16 +383,17 @@ def test_evaluate_calls(self):
374383
(MagicMock(return_value=0.4), MagicMock(return_value=180)),
375384
(MagicMock(return_value=0.6), MagicMock(return_value=120)),
376385
]
377-
evaluate(
378-
model=mock_model,
379-
dataset=mock_dataset,
380-
tokenizer=mock_tokenizer,
381-
batch_size=2,
382-
num_batches=2,
383-
max_seq_length=2048,
384-
loss=mock_default_loss,
385-
iterate_batches=mock_iterate_batches,
386-
)
386+
with swapped_with_identity(mx.distributed, "all_sum"):
387+
evaluate(
388+
model=mock_model,
389+
dataset=mock_dataset,
390+
tokenizer=mock_tokenizer,
391+
batch_size=2,
392+
num_batches=2,
393+
max_seq_length=2048,
394+
loss=mock_default_loss,
395+
iterate_batches=mock_iterate_batches,
396+
)
387397

388398
mock_iterate_batches.assert_called_once_with(
389399
dataset=mock_dataset,
@@ -412,16 +422,17 @@ def test_evaluate_infinite_batches(self):
412422
(MagicMock(return_value=0.2), MagicMock(return_value=150)),
413423
]
414424

415-
evaluate(
416-
model=mock_model,
417-
dataset=mock_dataset,
418-
tokenizer=mock_tokenizer,
419-
batch_size=2,
420-
num_batches=-1,
421-
max_seq_length=2048,
422-
loss=mock_default_loss,
423-
iterate_batches=mock_iterate_batches,
424-
)
425+
with swapped_with_identity(mx.distributed, "all_sum"):
426+
evaluate(
427+
model=mock_model,
428+
dataset=mock_dataset,
429+
tokenizer=mock_tokenizer,
430+
batch_size=2,
431+
num_batches=-1,
432+
max_seq_length=2048,
433+
loss=mock_default_loss,
434+
iterate_batches=mock_iterate_batches,
435+
)
425436

426437
mock_iterate_batches.assert_called_once_with(
427438
dataset=mock_dataset,

0 commit comments

Comments
 (0)