Commit c9997d6

Fixes ray lazy metric reporting and hanging processes (#2346)
# Description

The step() function of ray/tuner.py has issues that prevent an uninterrupted Ray hyperparameter tuning session. Please refer to #2328 for details. Fixes #2328.

## Type of change

- Bug fix (non-breaking change which fixes an issue)

## Checklist

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there
1 parent 82cb320 commit c9997d6
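
The fix has two halves: `step()` now reports a sentinel key in the trial result when it cannot recover the experiment logs, and a custom `tune.Stopper` counts those sentinels and aborts the entire run once a limit is exceeded. Below is a minimal, standalone sketch of that report-and-stop pattern; the names are illustrative and there is no Ray or Isaac Lab dependency here, the real implementation is in `scripts/reinforcement_learning/ray/tuner.py` further down.

```python
# Minimal sketch of the "sentinel flag + stopper" pattern added in this commit.
# Names are hypothetical; see ray/tuner.py below for the actual LogExtractionErrorStopper.

class ErrorFlagStopper:
    """Counts results carrying an error flag; stops everything once a limit is exceeded."""

    def __init__(self, max_errors: int):
        self.max_errors = max_errors
        self.error_count = 0

    def __call__(self, trial_id: str, result: dict) -> bool:
        # Never stops an individual trial, only tallies flagged failures.
        if result.get("ERROR_FLAG", False):
            self.error_count += 1
        return False

    def stop_all(self) -> bool:
        # Ray Tune polls this to decide whether to abort the whole experiment.
        return self.error_count > self.max_errors


if __name__ == "__main__":
    stopper = ErrorFlagStopper(max_errors=2)
    fake_results = [{"reward": 1.0}, {"ERROR_FLAG": True, "done": True}] * 3
    for i, result in enumerate(fake_results):
        stopper(f"trial_{i}", result)
        print(f"after result {i}: stop_all() -> {stopper.stop_all()}")
```

Returning `False` from `__call__` keeps individual trials alive; only `stop_all()`, which Ray Tune checks after each reported result, can end the experiment.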

File tree

5 files changed (+212, -52 lines)

scripts/reinforcement_learning/ray/tuner.py

+114 -8
@@ -5,8 +5,9 @@
 import argparse
 import importlib.util
 import os
+import subprocess
 import sys
-from time import sleep
+from time import sleep, time

 import ray
 import util
@@ -57,6 +58,9 @@
 PYTHON_EXEC = "./isaaclab.sh -p"
 WORKFLOW = "scripts/reinforcement_learning/rl_games/train.py"
 NUM_WORKERS_PER_NODE = 1  # needed for local parallelism
+PROCESS_RESPONSE_TIMEOUT = 200.0  # seconds to wait before killing the process when it stops responding
+MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS = 1000  # maximum number of lines to read from the training process logs
+MAX_LOG_EXTRACTION_ERRORS = 2  # maximum allowed LogExtractionErrors before we abort the whole training


 class IsaacLabTuneTrainable(tune.Trainable):
@@ -70,6 +74,7 @@ class IsaacLabTuneTrainable(tune.Trainable):
     def setup(self, config: dict) -> None:
         """Get the invocation command, return quick for easy scheduling."""
         self.data = None
+        self.time_since_last_proc_response = 0.0
         self.invoke_cmd = util.get_invocation_command_from_cfg(cfg=config, python_cmd=PYTHON_EXEC, workflow=WORKFLOW)
         print(f"[INFO]: Recovered invocation with {self.invoke_cmd}")
         self.experiment = None
@@ -84,12 +89,21 @@ def step(self) -> dict:
             # When including this as first step instead of setup, experiments get scheduled faster
             # Don't want to block the scheduler while the experiment spins up
             print(f"[INFO]: Invoking experiment as first step with {self.invoke_cmd}...")
-            experiment = util.execute_job(
-                self.invoke_cmd,
-                identifier_string="",
-                extract_experiment=True,
-                persistent_dir=BASE_DIR,
-            )
+            try:
+                experiment = util.execute_job(
+                    self.invoke_cmd,
+                    identifier_string="",
+                    extract_experiment=True,  # Keep this as True to return a valid dictionary
+                    persistent_dir=BASE_DIR,
+                    max_lines_to_search_logs=MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS,
+                    max_time_to_search_logs=PROCESS_RESPONSE_TIMEOUT,
+                )
+            except util.LogExtractionError:
+                self.data = {
+                    "LOG_EXTRACTION_ERROR_STOPPER_FLAG": True,
+                    "done": True,
+                }
+                return self.data
             self.experiment = experiment
             print(f"[INFO]: Tuner recovered experiment info {experiment}")
             self.proc = experiment["proc"]
@@ -109,11 +123,35 @@ def step(self) -> dict:

             while data is None:
                 data = util.load_tensorboard_logs(self.tensorboard_logdir)
+                proc_status = self.proc.poll()
+                if proc_status is not None:
+                    break
                 sleep(2)  # Lazy report metrics to avoid performance overhead

             if self.data is not None:
-                while util._dicts_equal(data, self.data):
+                data_ = {k: v for k, v in data.items() if k != "done"}
+                self_data_ = {k: v for k, v in self.data.items() if k != "done"}
+                unresponsiveness_start_time = time()
+                while util._dicts_equal(data_, self_data_):
+                    self.time_since_last_proc_response = time() - unresponsiveness_start_time
                     data = util.load_tensorboard_logs(self.tensorboard_logdir)
+                    data_ = {k: v for k, v in data.items() if k != "done"}
+                    proc_status = self.proc.poll()
+                    if proc_status is not None:
+                        break
+                    if self.time_since_last_proc_response > PROCESS_RESPONSE_TIMEOUT:
+                        self.time_since_last_proc_response = 0.0
+                        print("[WARNING]: Training workflow process is not responding, terminating...")
+                        self.proc.terminate()
+                        try:
+                            self.proc.wait(timeout=20)
+                        except subprocess.TimeoutExpired:
+                            print("[ERROR]: The process did not terminate within timeout duration.")
+                            self.proc.kill()
+                            self.proc.wait()
+                        self.data = data
+                        self.data["done"] = True
+                        return self.data
                     sleep(2)  # Lazy report metrics to avoid performance overhead

             self.data = data
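
The hunk above is the heart of the hang fix: while waiting for fresh TensorBoard scalars, the tuner now polls the training subprocess and tracks how long the metrics have been stale; once the staleness exceeds `PROCESS_RESPONSE_TIMEOUT`, it terminates the process and escalates to `kill()` if termination times out. Here is a standalone sketch of that terminate-then-kill escalation, using a deliberately unresponsive toy child instead of a training workflow:

```python
# Illustrative only: reproduces the terminate -> wait -> kill escalation used in step().
import subprocess
import sys
from time import sleep, time

# A child that never produces the "metrics" we are waiting for.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(600)"])

RESPONSE_TIMEOUT = 5.0  # seconds of staleness tolerated (200.0 in the real tuner)
stale_since = time()

while proc.poll() is None:
    if time() - stale_since > RESPONSE_TIMEOUT:
        print("[WARNING]: child is not responding, terminating...")
        proc.terminate()  # polite SIGTERM first
        try:
            proc.wait(timeout=20)
        except subprocess.TimeoutExpired:
            print("[ERROR]: child ignored SIGTERM, killing...")
            proc.kill()  # force SIGKILL
            proc.wait()
        break
    sleep(1)  # in the tuner, fresh TensorBoard scalars arriving here would reset the timer

print(f"child exited with return code {proc.returncode}")
```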
@@ -132,6 +170,39 @@ def default_resource_request(self):
         )


+class LogExtractionErrorStopper(tune.Stopper):
+    """Stopper that stops all trials if multiple LogExtractionErrors occur.
+
+    Args:
+        max_errors: The maximum number of LogExtractionErrors allowed before terminating the experiment.
+    """
+
+    def __init__(self, max_errors: int):
+        self.max_errors = max_errors
+        self.error_count = 0
+
+    def __call__(self, trial_id, result):
+        """Increments the error count if trial has encountered a LogExtractionError.
+
+        It does not stop the trial based on the metrics, always returning False.
+        """
+        if result.get("LOG_EXTRACTION_ERROR_STOPPER_FLAG", False):
+            self.error_count += 1
+            print(
+                f"[ERROR]: Encountered LogExtractionError {self.error_count} times. "
+                f"Maximum allowed is {self.max_errors}."
+            )
+        return False
+
+    def stop_all(self):
+        """Returns true if number of LogExtractionErrors exceeds the maximum allowed, terminating the experiment."""
+        if self.error_count > self.max_errors:
+            print("[FATAL]: Encountered LogExtractionError more than allowed, aborting entire tuning run... ")
+            return True
+        else:
+            return False
+
+
 def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
     """Invoke an Isaac-Ray tuning run.

@@ -175,6 +246,7 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
                 checkpoint_frequency=0,  # Disable periodic checkpointing
                 checkpoint_at_end=False,  # Disable final checkpoint
             ),
+            stop=LogExtractionErrorStopper(max_errors=MAX_LOG_EXTRACTION_ERRORS),
         )

     elif args.run_mode == "remote":  # MLFlow, to MLFlow server
@@ -190,6 +262,7 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
             storage_path="/tmp/ray",
             callbacks=[mlflow_callback],
             checkpoint_config=ray.train.CheckpointConfig(checkpoint_frequency=0, checkpoint_at_end=False),
+            stop=LogExtractionErrorStopper(max_errors=MAX_LOG_EXTRACTION_ERRORS),
         )
     else:
         raise ValueError("Unrecognized run mode.")
@@ -199,6 +272,8 @@ def invoke_tuning_run(cfg: dict, args: argparse.Namespace) -> None:
         IsaacLabTuneTrainable,
         param_space=cfg,
         tune_config=tune.TuneConfig(
+            metric=args.metric,
+            mode=args.mode,
             search_alg=repeat_search,
             num_samples=args.num_samples,
             reuse_actors=True,
@@ -306,8 +381,39 @@ def __init__(self, cfg: dict):
         default=3,
         help="How many times to repeat each hyperparameter config.",
     )
+    parser.add_argument(
+        "--process_response_timeout",
+        type=float,
+        default=PROCESS_RESPONSE_TIMEOUT,
+        help="Training workflow process response timeout.",
+    )
+    parser.add_argument(
+        "--max_lines_to_search_experiment_logs",
+        type=float,
+        default=MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS,
+        help="Max number of lines to search for experiment logs before terminating the training workflow process.",
+    )
+    parser.add_argument(
+        "--max_log_extraction_errors",
+        type=float,
+        default=MAX_LOG_EXTRACTION_ERRORS,
+        help="Max number of LogExtractionError failures before we abort the whole tuning run.",
+    )

     args = parser.parse_args()
+    PROCESS_RESPONSE_TIMEOUT = args.process_response_timeout
+    MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS = int(args.max_lines_to_search_experiment_logs)
+    print(
+        "[INFO]: The max number of lines to search for experiment logs before (early) terminating the training "
+        f"workflow process is set to {MAX_LINES_TO_SEARCH_EXPERIMENT_LOGS}.\n"
+        "[INFO]: The process response timeout, used while updating tensorboard scalars and searching for "
+        f"experiment logs, is set to {PROCESS_RESPONSE_TIMEOUT} seconds."
+    )
+    MAX_LOG_EXTRACTION_ERRORS = int(args.max_log_extraction_errors)
+    print(
+        "[INFO]: Max number of LogExtractionError failures before we abort the whole tuning run is "
+        f"set to {MAX_LOG_EXTRACTION_ERRORS}.\n"
+    )
     NUM_WORKERS_PER_NODE = args.num_workers_per_node
     print(f"[INFO]: Using {NUM_WORKERS_PER_NODE} workers per node.")
     if args.run_mode == "remote":

scripts/reinforcement_learning/ray/util.py

+94 -42
@@ -5,10 +5,12 @@
 import argparse
 import os
 import re
+import select
 import subprocess
 import threading
 from datetime import datetime
 from math import isclose
+from time import time

 import ray
 from tensorboard.backend.event_processing.directory_watcher import DirectoryDeletedError
@@ -26,14 +28,20 @@ def load_tensorboard_logs(directory: str) -> dict:
         The latest available scalar values.
     """

+    # replace any non-alnum/underscore/dot with "_", then collapse runs of "_"
+    def replace_invalid_chars(t):
+        t2 = re.sub(r"[^0-9A-Za-z_./]", "_", t)
+        t2 = re.sub(r"_+", "_", t2)
+        return t2.strip("_")
+
     # Initialize the event accumulator with a size guidance for only the latest entry
     def get_latest_scalars(path: str) -> dict:
         event_acc = EventAccumulator(path, size_guidance={"scalars": 1})
         try:
             event_acc.Reload()
             if event_acc.Tags()["scalars"]:
                 return {
-                    tag: event_acc.Scalars(tag)[-1].value
+                    replace_invalid_chars(tag): event_acc.Scalars(tag)[-1].value
                     for tag in event_acc.Tags()["scalars"]
                     if event_acc.Scalars(tag)
                 }
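
The new `replace_invalid_chars` helper above matters because TensorBoard scalar tags often contain spaces and parentheses (e.g. `Episode Reward/reward (max)`), which make awkward metric keys when fed to Ray Tune (see `metric=args.metric` in the tuner diff above). A quick check of the same regex logic on made-up tags:

```python
import re

def replace_invalid_chars(t: str) -> str:
    # Same logic as the helper added to load_tensorboard_logs():
    # replace anything that is not alphanumeric, "_", ".", or "/" with "_", then collapse runs of "_".
    t2 = re.sub(r"[^0-9A-Za-z_./]", "_", t)
    t2 = re.sub(r"_+", "_", t2)
    return t2.strip("_")

print(replace_invalid_chars("Episode Reward/reward (max)"))  # -> Episode_Reward/reward_max
print(replace_invalid_chars("Info / Episode_Length"))        # -> Info_/_Episode_Length
```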
@@ -98,13 +106,21 @@ def remote_execute_job(
     )


+class LogExtractionError(Exception):
+    """Raised when we cannot extract experiment_name/logdir from the trainer output."""
+
+    pass
+
+
 def execute_job(
     job_cmd: str,
     identifier_string: str = "job 0",
     test_mode: bool = False,
     extract_experiment: bool = False,
     persistent_dir: str | None = None,
     log_all_output: bool = False,
+    max_lines_to_search_logs: int = 1000,
+    max_time_to_search_logs: float = 200.0,
 ) -> str | dict:
     """Issue a job (shell command).

@@ -117,6 +133,8 @@ def execute_job(
         persistent_dir: When supplied, change to run the directory in a persistent
             directory. Can be used to avoid losing logs in the /tmp directory. Defaults to None.
         log_all_output: When true, print all output to the console. Defaults to False.
+        max_lines_to_search_logs: Maximum number of lines to search for experiment info. Defaults to 1000.
+        max_time_to_search_logs: Maximum time to wait for experiment info before giving up. Defaults to 200.0 seconds.
     Raises:
         ValueError: If the job is unable to start, or throws an error. Most likely to happen
             due to running out of memory.
@@ -190,6 +208,8 @@ def execute_job(
     process = subprocess.Popen(
         job_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1
     )
+    process_file_descriptor = process.stdout.fileno()
+
     if persistent_dir:
         os.chdir(og_dir)
     experiment_name = None
@@ -205,48 +225,80 @@ def stream_reader(stream, identifier_string, result_details):
             if log_all_output:
                 print(f"{identifier_string}: {line}")

-    # Read stdout until we find experiment info
+    # Read stdout until we find exp. info, up to max_lines_to_search_logs lines, max_time_to_search_logs, or EOF.
     # Do some careful handling prevent overflowing the pipe reading buffer with error 141
-    for line in iter(process.stdout.readline, ""):
-        line = line.strip()
-        result_details.append(f"{identifier_string}: {line} \n")
-        if log_all_output:
-            print(f"{identifier_string}: {line}")
-
-        if extract_experiment:
-            exp_match = experiment_info_pattern.search(line)
-            log_match = logdir_pattern.search(line)
-            err_match = err_pattern.search(line)
-
-            if err_match:
-                raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
-
-            if exp_match:
-                experiment_name = exp_match.group(1)
-            if log_match:
-                logdir = log_match.group(1)
-
-            if experiment_name and logdir:
-                # Start stderr reader after finding experiment info
-                stderr_thread = threading.Thread(
-                    target=stream_reader, args=(process.stderr, identifier_string, result_details)
-                )
-                stderr_thread.daemon = True
-                stderr_thread.start()
-
-                # Start stdout reader to continue reading to flush buffer
-                stdout_thread = threading.Thread(
-                    target=stream_reader, args=(process.stdout, identifier_string, result_details)
-                )
-                stdout_thread.daemon = True
-                stdout_thread.start()
-
-                return {
-                    "experiment_name": experiment_name,
-                    "logdir": logdir,
-                    "proc": process,
-                    "result": " ".join(result_details),
-                }
+    lines_read = 0
+    search_duration = 0.0
+    search_start_time = time()
+    while True:
+        new_line_ready, _, _ = select.select([process_file_descriptor], [], [], 1.0)  # Wait up to 1s for stdout
+        if new_line_ready:
+            line = process.stdout.readline()
+            if not line:  # EOF
+                break
+
+            lines_read += 1
+            line = line.strip()
+            result_details.append(f"{identifier_string}: {line} \n")
+
+            if log_all_output:
+                print(f"{identifier_string}: {line}")
+
+            if extract_experiment:
+                exp_match = experiment_info_pattern.search(line)
+                log_match = logdir_pattern.search(line)
+                err_match = err_pattern.search(line)
+
+                if err_match:
+                    raise ValueError(f"Encountered an error during trial run. {' '.join(result_details)}")
+
+                if exp_match:
+                    experiment_name = exp_match.group(1)
+                if log_match:
+                    logdir = log_match.group(1)
+
+                if experiment_name and logdir:
+                    # Start stderr reader after finding experiment info
+                    stderr_thread = threading.Thread(
+                        target=stream_reader, args=(process.stderr, identifier_string, result_details)
+                    )
+                    stderr_thread.daemon = True
+                    stderr_thread.start()
+
+                    # Start stdout reader to continue reading to flush buffer
+                    stdout_thread = threading.Thread(
+                        target=stream_reader, args=(process.stdout, identifier_string, result_details)
+                    )
+                    stdout_thread.daemon = True
+                    stdout_thread.start()
+
+                    return {
+                        "experiment_name": experiment_name,
+                        "logdir": logdir,
+                        "proc": process,
+                        "result": " ".join(result_details),
+                    }
+
+        if extract_experiment:  # if we are looking for experiment info, check for timeouts and line limits
+            search_duration = time() - search_start_time
+            if search_duration > max_time_to_search_logs:
+                print(f"[ERROR]: Could not find experiment logs within {max_time_to_search_logs} seconds.")
+                break
+            if lines_read >= max_lines_to_search_logs:
+                print(f"[ERROR]: Could not find experiment logs within first {max_lines_to_search_logs} lines.")
+                break

+    # If we reach here, we didn't find experiment info in the output
+    if extract_experiment and not (experiment_name and logdir):
+        error_msg = (
+            "Could not extract experiment_name/logdir from trainer output "
+            f"(experiment_name={experiment_name!r}, logdir={logdir!r}).\n"
+            "\tMake sure your training script prints the following correctly:\n"
+            "\t\tExact experiment name requested from command line: <name>\n"
+            "\t\t[INFO] Logging experiment in directory: <logdir>\n\n"
+        )
+        print(f"[ERROR]: {error_msg}")
+        raise LogExtractionError("Could not extract experiment_name/logdir from training workflow output.")
     process.wait()
     now = datetime.now().strftime("%H:%M:%S.%f")
     completion_info = f"\n[INFO]: {identifier_string}: Job Started at {start_time}, completed at {now}\n"
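
The rewritten reader above swaps the blocking `readline()` loop for `select.select()` on the child's stdout, so the search for experiment info can give up after `max_lines_to_search_logs` lines or `max_time_to_search_logs` seconds even if the trainer goes silent. A standalone sketch of that bounded-read pattern (POSIX only, since `select` does not support pipe handles on Windows), again with a toy child process instead of a training workflow:

```python
# Illustrative only: bounded, non-blocking line reading from a child process's stdout.
import select
import subprocess
import sys
from time import time

child_code = "import time\nfor i in range(100):\n    print(f'line {i}', flush=True)\n    time.sleep(0.5)\n"
proc = subprocess.Popen([sys.executable, "-c", child_code], stdout=subprocess.PIPE, text=True, bufsize=1)
fd = proc.stdout.fileno()

MAX_LINES = 5       # analogous to max_lines_to_search_logs
MAX_SECONDS = 3.0   # analogous to max_time_to_search_logs
lines_read = 0
start = time()

while True:
    ready, _, _ = select.select([fd], [], [], 1.0)  # wait up to 1 s for output
    if ready:
        line = proc.stdout.readline()
        if not line:  # EOF
            break
        lines_read += 1
        print(f"got: {line.strip()}")
    if lines_read >= MAX_LINES or time() - start > MAX_SECONDS:
        print("giving up on the child's output")
        break

proc.terminate()
proc.wait()
```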
