Skip to content

Commit c04ccea

Browse files
authored
fix: call entry_point.run with capture_error=True (#398)

- Call `entry_point.run` with `capture_error=True`.
- Format the code with black.
1 parent e2bb95a commit c04ccea

File tree

23 files changed

+1677
-1291
lines changed

23 files changed

+1677
-1291
lines changed

benchmarks/horovod-resnet/execute_horovod_training.py

Lines changed: 74 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from sagemaker.tensorflow import TensorFlow
2727

2828
dir_path = os.path.dirname(os.path.realpath(__file__))
29-
benchmark_results_dir = os.path.join('s3://', Session().default_bucket(), 'hvd-benchmarking')
29+
benchmark_results_dir = os.path.join("s3://", Session().default_bucket(), "hvd-benchmarking")
3030

3131

3232
@click.group()
@@ -35,130 +35,133 @@ def cli():
3535

3636

3737
def generate_report():
    """Download Horovod benchmark results from S3 and aggregate them.

    Re-creates a local ``results`` directory from scratch, copies every
    training job's output down from ``benchmark_results_dir``, unpacks each
    job's ``model.tar.gz``, and collects per-job metadata parsed from the job
    name and from the ``benchmark_run.log`` file written by the TensorFlow
    benchmarking harness.

    Returns:
        pandas.DataFrame: one column per training job; rows are the collected
        metadata fields.
    """
    results_dir = os.path.join(dir_path, "results")

    # Start from a clean slate so stale results never leak into the report.
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)

    subprocess.call(["aws", "s3", "cp", "--recursive", benchmark_results_dir, results_dir])

    jobs = {}

    # NOTE: the loop variable was previously named ``job_name``, which
    # shadowed the module-level job_name() helper; renamed to avoid that.
    for job_dir_name in os.listdir(results_dir):
        jobs[job_dir_name] = {}

        # Names are produced by job_name() as hvd-<type>-<count>-<device>-<py>
        # with a SageMaker-appended timestamp; only the leading fields matter.
        # Assumes exactly 12 dash-separated fields — TODO confirm against the
        # job-naming scheme if SageMaker's suffix format changes.
        _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = (
            job_dir_name.split("-")
        )

        current_dir = os.path.join(results_dir, job_dir_name)

        # Unpack the model artifacts next to the job's other outputs.
        model_dir = os.path.join(current_dir, "output", "model.tar.gz")
        subprocess.call(["tar", "-xvzf", model_dir], cwd=current_dir)

        jobs[job_dir_name]["instance_type"] = instance_type
        jobs[job_dir_name]["instance_count"] = instance_count
        jobs[job_dir_name]["device"] = device
        jobs[job_dir_name]["py_version"] = py_version

        benchmark_log = os.path.join(current_dir, "benchmark_run.log")

        # Jobs that failed before the harness wrote its log still get the
        # name-derived columns above, just no harness metadata.
        if os.path.exists(benchmark_log):
            with open(benchmark_log) as f:
                data = json.load(f)

            jobs[job_dir_name]["dataset"] = data["dataset"]["name"]
            jobs[job_dir_name]["num_cores"] = data["machine_config"]["cpu_info"]["num_cores"]
            jobs[job_dir_name]["cpu_info"] = data["machine_config"]["cpu_info"]["cpu_info"]
            jobs[job_dir_name]["mhz_per_cpu"] = data["machine_config"]["cpu_info"]["mhz_per_cpu"]
            jobs[job_dir_name]["gpu_count"] = data["machine_config"]["gpu_info"]["count"]
            jobs[job_dir_name]["gpu_model"] = data["machine_config"]["gpu_info"]["model"]

            def find_value(parameter):
                # Each run parameter looks like {"name": ..., <value_key>: ...};
                # return the value stored under whichever key is not "name".
                other_key = [k for k in parameter if k != "name"][0]
                return parameter[other_key]

            for parameter in data["run_parameters"]:
                jobs[job_dir_name][parameter["name"]] = find_value(parameter)

            jobs[job_dir_name]["model_name"] = data["model_name"]
            jobs[job_dir_name]["run_date"] = data["run_date"]
            jobs[job_dir_name]["tensorflow_version"] = data["tensorflow_version"]["version"]
            jobs[job_dir_name]["tensorflow_version_git_hash"] = data["tensorflow_version"][
                "git_hash"
            ]

    return pd.DataFrame(jobs)
8992

9093

91-
@cli.command("train")
@click.option("--framework-version", required=True, type=click.Choice(["1.11", "1.12"]))
@click.option("--device", required=True, type=click.Choice(["cpu", "gpu"]))
@click.option("--py-versions", multiple=True, type=str)
@click.option("--training-input-mode", default="File", type=click.Choice(["File", "Pipe"]))
@click.option("--networking-isolation/--no-networking-isolation", default=False)
@click.option("--wait/--no-wait", default=False)
@click.option("--security-groups", multiple=True, type=str)
@click.option("--subnets", multiple=True, type=str)
@click.option("--role", default="SageMakerRole", type=str)
@click.option("--instance-counts", multiple=True, type=int)
@click.option("--instance-types", multiple=True, type=str)
@click.argument("script_args", nargs=-1, type=str)
def train(
    framework_version,
    device,
    py_versions,
    training_input_mode,
    networking_isolation,
    wait,
    security_groups,
    subnets,
    role,
    instance_counts,
    instance_types,
    script_args,
):
    """Launch one Horovod training job per (type, py_version, count) combination.

    Each combination of instance type, Python version, and instance count gets
    its own SageMaker TensorFlow script-mode job running Horovod over MPI.
    With ``--wait``, the model artifacts of each finished job are downloaded
    and unpacked under ``<dir_path>/results/<job name>``.
    """
    for instance_type, py_version, instance_count in itertools.product(
        instance_types, py_versions, instance_counts
    ):
        # MPI flags tuned for Horovod: hierarchical allreduce, larger fusion
        # buffer, verbose TF logging, and a Horovod timeline written into the
        # model directory so it is captured with the job artifacts.
        mpi_options = "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog"

        hyperparameters = {
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": 8,
            "sagemaker_mpi_custom_mpi_options": mpi_options,
        }

        estimator = TensorFlow(
            entry_point=os.path.join(dir_path, "train.sh"),
            role=role,
            dependencies=[os.path.join(dir_path, "train_imagenet_resnet_hvd.py")],
            base_job_name=job_name(instance_type, instance_count, device, py_version),
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            framework_version=framework_version,
            py_version=py_version,
            script_mode=True,
            hyperparameters=hyperparameters,
            output_path=benchmark_results_dir,
            security_group_ids=security_groups,
            subnets=subnets,
        )

        estimator.fit(wait=wait)

        # Only a completed (waited-for) job has artifacts to fetch.
        if wait:
            artifacts_path = os.path.join(
                dir_path, "results", estimator.latest_training_job.job_name
            )
            model_path = os.path.join(artifacts_path, "model.tar.gz")
            os.makedirs(artifacts_path)
            subprocess.call(["aws", "s3", "cp", estimator.model_data, model_path])
            subprocess.call(["tar", "-xvzf", model_path], cwd=artifacts_path)

            print("Model downloaded at %s" % model_path)

160+
def job_name(instance_type, instance_count, device, python_version):
    """Build the base SageMaker job name for one benchmark configuration.

    Example: ``("ml.p3.16xlarge", 2, "gpu", "py3")`` -> ``"hvd-p316xlarge-2-gpu-py3"``.
    """
    # Compact the instance type: drop the dots, then drop every "ml" substring
    # (e.g. "ml.p3.16xlarge" -> "mlp316xlarge" -> "p316xlarge").
    compact_type = instance_type.replace(".", "").replace("ml", "")
    return "hvd-%s-%s-%s-%s" % (compact_type, instance_count, device, python_version)
162165

163-
if __name__ == "__main__":
    # Script entry point: dispatch to the click command group.
    cli()

0 commit comments

Comments
 (0)