26
26
from sagemaker .tensorflow import TensorFlow
27
27
28
28
# Directory containing this script; used to resolve sibling files (train.sh,
# train_imagenet_resnet_hvd.py) and the local results/ download directory.
dir_path = os.path.dirname(os.path.realpath(__file__))

# S3 prefix where all benchmark training jobs write their output artifacts.
# NOTE(review): os.path.join is used to build an S3 URI — this yields
# "s3://<bucket>/hvd-benchmarking" on POSIX but would use backslashes on
# Windows; confirm this script only runs on POSIX hosts.
benchmark_results_dir = os.path.join("s3://", Session().default_bucket(), "hvd-benchmarking")
30
30
31
31
32
32
@click .group ()
@@ -35,130 +35,133 @@ def cli():
35
35
36
36
37
37
def generate_report():
    """Download all Horovod benchmark job outputs from S3 and collate them.

    Syncs ``benchmark_results_dir`` into a fresh local ``results/`` directory,
    unpacks each job's model artifact, parses its ``benchmark_run.log`` JSON
    (when present), and returns one pandas DataFrame with a column per job.
    """
    results_dir = os.path.join(dir_path, "results")

    # Start from a clean local copy so stale jobs don't leak into the report.
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)

    # Relies on the AWS CLI being installed and credentialed.
    subprocess.call(["aws", "s3", "cp", "--recursive", benchmark_results_dir, results_dir])

    jobs = {}

    for job_name in os.listdir(results_dir):
        jobs[job_name] = {}

        # Job names come from job_name() plus SageMaker-appended suffixes and
        # are expected to have exactly 12 dash-separated fields; a job named
        # differently would raise ValueError here — TODO confirm that is intended.
        _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split(
            "-"
        )

        current_dir = os.path.join(results_dir, job_name)

        # Unpack the job's model tarball in place; the benchmark log and any
        # Horovod timeline files are inside it.
        model_dir = os.path.join(current_dir, "output", "model.tar.gz")
        subprocess.call(["tar", "-xvzf", model_dir], cwd=current_dir)

        jobs[job_name]["instance_type"] = instance_type
        jobs[job_name]["instance_count"] = instance_count
        jobs[job_name]["device"] = device
        jobs[job_name]["py_version"] = py_version

        benchmark_log = os.path.join(current_dir, "benchmark_run.log")

        # Jobs that died before the benchmark wrote its log still appear in the
        # report, just with only the name-derived columns filled in.
        if os.path.exists(benchmark_log):
            with open(benchmark_log) as f:
                data = json.load(f)

            jobs[job_name]["dataset"] = data["dataset"]["name"]
            jobs[job_name]["num_cores"] = data["machine_config"]["cpu_info"]["num_cores"]
            jobs[job_name]["cpu_info"] = data["machine_config"]["cpu_info"]["cpu_info"]
            jobs[job_name]["mhz_per_cpu"] = data["machine_config"]["cpu_info"]["mhz_per_cpu"]
            jobs[job_name]["gpu_count"] = data["machine_config"]["gpu_info"]["count"]
            jobs[job_name]["gpu_model"] = data["machine_config"]["gpu_info"]["model"]

            def find_value(parameter):
                # Each run parameter is a dict with a "name" key plus exactly
                # one typed value key; return whichever key isn't "name".
                other_key = [k for k in parameter if k != "name"][0]
                return parameter[other_key]

            for parameter in data["run_parameters"]:
                jobs[job_name][parameter["name"]] = find_value(parameter)

            jobs[job_name]["model_name"] = data["model_name"]
            jobs[job_name]["run_date"] = data["run_date"]
            jobs[job_name]["tensorflow_version"] = data["tensorflow_version"]["version"]
            jobs[job_name]["tensorflow_version_git_hash"] = data["tensorflow_version"][
                "git_hash"
            ]

    # One column per job, one row per collected attribute.
    return pd.DataFrame(jobs)
89
92
90
93
91
@cli.command("train")
@click.option("--framework-version", required=True, type=click.Choice(["1.11", "1.12"]))
@click.option("--device", required=True, type=click.Choice(["cpu", "gpu"]))
@click.option("--py-versions", multiple=True, type=str)
@click.option("--training-input-mode", default="File", type=click.Choice(["File", "Pipe"]))
@click.option("--networking-isolation/--no-networking-isolation", default=False)
@click.option("--wait/--no-wait", default=False)
@click.option("--security-groups", multiple=True, type=str)
@click.option("--subnets", multiple=True, type=str)
@click.option("--role", default="SageMakerRole", type=str)
@click.option("--instance-counts", multiple=True, type=int)
@click.option("--instance-types", multiple=True, type=str)
@click.argument("script_args", nargs=-1, type=str)
def train(
    framework_version,
    device,
    py_versions,
    training_input_mode,
    networking_isolation,
    wait,
    security_groups,
    subnets,
    role,
    instance_counts,
    instance_types,
    script_args,
):
    """Launch one Horovod benchmark training job per (instance_type,
    py_version, instance_count) combination.

    Each job runs train.sh in SageMaker script mode with MPI enabled and
    writes its output under ``benchmark_results_dir``. With --wait, each
    job is run to completion and its model artifact is downloaded and
    unpacked locally before the next combination starts.
    """
    # Cartesian product: one training job per configuration combination.
    iterator = itertools.product(instance_types, py_versions, instance_counts)
    for instance_type, py_version, instance_count in iterator:
        base_name = job_name(instance_type, instance_count, device, py_version)

        # Horovod tuning flags plus timeline output under /opt/ml/model so the
        # trace is captured in the job's model artifact.
        mpi_options = "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog"
        estimator = TensorFlow(
            entry_point=os.path.join(dir_path, "train.sh"),
            role=role,
            dependencies=[os.path.join(dir_path, "train_imagenet_resnet_hvd.py")],
            base_job_name=base_name,
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            framework_version=framework_version,
            py_version=py_version,
            script_mode=True,
            hyperparameters={
                # sagemaker_* keys are consumed by the SageMaker container,
                # not passed to the training script.
                "sagemaker_mpi_enabled": True,
                "sagemaker_mpi_num_of_processes_per_host": 8,
                "sagemaker_mpi_custom_mpi_options": mpi_options,
            },
            output_path=benchmark_results_dir,
            security_group_ids=security_groups,
            subnets=subnets,
        )

        # With wait=False all jobs are launched asynchronously in parallel.
        estimator.fit(wait=wait)

        if wait:
            # Pull the finished job's model artifact down and unpack it so the
            # benchmark log is immediately inspectable.
            artifacts_path = os.path.join(
                dir_path, "results", estimator.latest_training_job.job_name
            )
            model_path = os.path.join(artifacts_path, "model.tar.gz")

            os.makedirs(artifacts_path)
            subprocess.call(["aws", "s3", "cp", estimator.model_data, model_path])
            subprocess.call(["tar", "-xvzf", model_path], cwd=artifacts_path)

            print("Model downloaded at %s" % model_path)
159
160
def job_name(instance_type, instance_count, device, python_version):
    """Build the base SageMaker job name for one benchmark configuration.

    The instance type is compacted by dropping dots and the "ml" prefix,
    e.g. "ml.p3.16xlarge" becomes "p316xlarge", giving names of the form
    "hvd-<type>-<count>-<device>-<py_version>".
    """
    compact_type = instance_type.replace(".", "").replace("ml", "")
    fields = ("hvd", compact_type, str(instance_count), str(device), str(python_version))
    return "-".join(fields)
162
165
163
if __name__ == "__main__":
    # Entry point: dispatch to the click command group defined above.
    cli()
0 commit comments