Skip to content

Commit 1ff67a1

Browse files
committed
Merge branch 'develop'
2 parents bb0cfd5 + 01a36d6 commit 1ff67a1

File tree

138 files changed

+3486
-165
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

138 files changed

+3486
-165
lines changed

.clang-format

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -96,10 +96,6 @@ PenaltyBreakString: 1000
9696
PenaltyExcessCharacter: 1000
9797
PenaltyReturnTypeOnItsOwnLine: 10000
9898
PointerAlignment: Right
99-
RawStringFormats:
100-
- Delimiter: pb
101-
Language: TextProto
102-
BasedOnStyle: google
10399
ReflowComments: true
104100
SortIncludes: true
105101
SortUsingDeclarations: true

.gitlab-ci.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@ stages:
55

66
variables:
77
DOCKER_HOST: tcp://docker:2375
8+
# This will instruct Docker not to start over TLS.
9+
DOCKER_TLS_CERTDIR: ""
810
DOCKER_DRIVER: overlay2
911
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
1012
LATEST_TAG: $CI_REGISTRY_IMAGE:latest

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# Version 3.8 required from CheckCXXFeature.cmake
22
# Version 3.10 required from Boost 1.66.0 for imported target
3-
cmake_minimum_required(VERSION 3.10.0)
3+
# Version 3.13 required for target_link_options
4+
cmake_minimum_required(VERSION 3.13.0)
45

56
project(executor VERSION 1.0.0 LANGUAGES C CXX)
67

@@ -84,6 +85,9 @@ endif(WITH_TCMALLOC)
8485
find_package(nlohmann_json)
8586
set_package_properties(nlohmann_json PROPERTIES TYPE OPTIONAL PURPOSE "For OpTracing logging")
8687

88+
set(THREADS_PREFER_PTHREAD_FLAG)
89+
find_package(Threads)
90+
8791
# Bundled third party library
8892
add_subdirectory(thirdparty)
8993
#---------------------------------------------------------------------------------------
@@ -144,6 +148,10 @@ if(WITH_TIMEOUT_WARNING)
144148
set(SALUS_ENABLE_TIMEOUT_WARNING 1)
145149
endif(WITH_TIMEOUT_WARNING)
146150

151+
if(USE_TENSORFLOW)
152+
set(SALUS_ENABLE_TENSORFLOW 1)
153+
endif(USE_TENSORFLOW)
154+
147155
configure_file(src/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
148156
include_directories(${CMAKE_CURRENT_BINARY_DIR})
149157

bc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
#! /bin/bash
2-
python -m benchmarks.driver "$@"
2+
vex tfbuild python -m benchmarks.driver "$@"

benchmarks/driver/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ def parse_expname(args):
156156

157157

158158
def main():
159-
# type: (Sequence[str]) -> None
159+
# type: () -> None
160160
# find first argument not starting with dash
161161
exp, argv = parse_expname(sys.argv)
162162

benchmarks/driver/runner.py

Lines changed: 212 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@
3030

3131
from .server import SalusServer
3232
from .tfserver import TFDistServer
33-
from .utils import Popen, execute, snake_to_pascal, str2bool
33+
from .utils import Popen, execute, snake_to_pascal, str2bool, remove_suffix
3434
from .utils.compatiblity import pathlib, subprocess as sp
3535

3636
Path = pathlib.Path
@@ -40,6 +40,11 @@
4040
flags.DEFINE_string('tfbench_base', '../tf_benchmarks', 'Base dir of TFBenchmark based workloads')
4141
flags.DEFINE_string('unit_base', 'tests', 'Base dir of unittest based workloads')
4242
flags.DEFINE_string('fathom_base', '../fathom', 'Base dir of Fathom based workloads')
43+
flags.DEFINE_string('tfweb_base', '../tfweb', 'Base dir of TFWeb based workloads')
44+
flags.DEFINE_string('tfweb_saved_model_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models',
45+
'SavedModel dir of TFWeb based workloads')
46+
flags.DEFINE_string('tfweb_request_body_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/reqeusts',
47+
'Predefined request body dir for TFWeb based workloads')
4348
flags.DEFINE_boolean('no_capture', False, 'Do not capture workload outputs')
4449

4550

@@ -113,28 +118,61 @@ def __call__(self, executor, output_file):
113118
'--num_batches={}'.format(self.wl.batch_num),
114119
'--batch_size={}'.format(self.wl.batch_size),
115120
]
116-
eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', '0.1')
117-
eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5')
121+
eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', None)
122+
eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', None)
118123
eval_block = self.wl.env.pop('SALUS_TFBENCH_EVAL_BLOCK', 'true')
124+
125+
eval_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_MODEL_DIR', 'models')
126+
eval_model_dir = str(Path(eval_model_dir).joinpath(remove_suffix(self.wl.name, 'eval')))
127+
128+
eval_saved_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR', None)
129+
if eval_saved_model_dir is not None:
130+
eval_saved_model_dir = str(Path(eval_saved_model_dir).joinpath(remove_suffix(self.wl.name, 'eval')))
131+
132+
num_seconds = self.wl.env.pop('SALUS_ITER_SECONDS', None)
133+
if num_seconds is not None:
134+
cmd += [
135+
'--num_seconds={}'.format(num_seconds)
136+
]
137+
138+
wait_for_signal = self.wl.env.pop('SALUS_WAIT_FOR_SIGNAL', None)
139+
if wait_for_signal is not None:
140+
cmd += [
141+
'--wait_for_signal={}'.format(wait_for_signal)
142+
]
143+
119144
if self.wl.name.endswith('eval'):
120-
model_name = self.wl.name.rsplit('eval')[0]
145+
model_name = remove_suffix(self.wl.name, 'eval')
121146
cmd += [
122-
'--model_dir=models/{}'.format(model_name),
147+
'--model_dir=' + eval_model_dir,
123148
'--model={}'.format(model_name),
124-
'--eval_interval_secs={}'.format(eval_interval),
125-
'--eval_interval_random_factor={}'.format(eval_rand_factor),
126149
'--eval_block={}'.format(eval_block),
127150
'--eval'
128151
]
152+
if eval_interval is not None:
153+
cmd += [
154+
'--eval_interval_secs={}'.format(eval_interval),
155+
]
156+
if eval_rand_factor is not None:
157+
cmd += [
158+
'--eval_interval_random_factor={}'.format(eval_rand_factor),
159+
]
160+
if eval_saved_model_dir is not None:
161+
cmd += [
162+
'--saved_model_dir=' + eval_saved_model_dir
163+
]
129164
else:
130165
cmd += [
131166
'--model={}'.format(self.wl.name),
132167
]
133168
if str2bool(self.wl.env.pop('SALUS_SAVE_MODEL', '')):
134169
cmd += [
135-
'--model_dir=models/{}'.format(self.wl.name),
170+
'--model_dir=' + eval_model_dir,
136171
]
137172

173+
cmd += self.wl.extra_args
174+
logger.info(f'Starting workload with cmd: {cmd}')
175+
138176
if FLAGS.no_capture:
139177
return execute(cmd, cwd=str(cwd), env=self.env)
140178
else:
@@ -157,6 +195,7 @@ def __call__(self, executor, output_file):
157195
# type: (Executor, Path) -> Popen
158196
env = self.env.copy()
159197
env['EXEC_ITER_NUMBER'] = str(self.wl.batch_num)
198+
env['SALUS_BATCH_SIZE'] = str(self.wl.batch_size)
160199
if executor == Executor.TFDist:
161200
env['SALUS_TFDIST_ENDPOINT'] = TFDistServer.current_server().endpoint
162201

@@ -166,12 +205,16 @@ def __call__(self, executor, output_file):
166205
'stdbuf', '-o0', '-e0', '--',
167206
'python', '-m', pkg, method,
168207
]
208+
cmd += self.wl.extra_args
209+
210+
logger.info(f'Starting workload with cmd: {cmd}')
169211
if FLAGS.no_capture:
170212
return execute(cmd, cwd=str(cwd), env=self.env)
171213
else:
172214
output_file.parent.mkdir(exist_ok=True, parents=True)
173215
with output_file.open('w') as f:
174-
return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT)
216+
# return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT)
217+
return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=None)
175218

176219
def _construct_test_name(self, executor):
177220
# type: (Executor) -> Tuple[str, str]
@@ -197,6 +240,12 @@ def _construct_test_name(self, executor):
197240
})
198241
}
199242

243+
variable_batch_size_models = {'vae', 'superres', 'seq2seq', 'mnistsf', 'mnistcv', 'mnistlg'}
244+
if remove_suffix(self.wl.name, 'eval') not in variable_batch_size_models:
245+
if self.wl.batch_size not in self.wl.wtl.available_batch_sizes():
246+
raise ValueError(f"Batch size `{self.wl.batch_size}' is not supported for {self.wl.name},"
247+
f" available ones: {self.wl.wtl.available_batch_sizes()}")
248+
200249
if executor == Executor.Salus:
201250
prefix = 'test_rpc_'
202251
elif executor == Executor.TF:
@@ -209,19 +258,26 @@ def _construct_test_name(self, executor):
209258
if self.wl.name.endswith('eval'):
210259
prefix += 'eval_'
211260

212-
model_name = self.wl.name.rsplit('eval')[0]
261+
model_name = remove_suffix(self.wl.name, 'eval')
213262

214263
if model_name in supported_model:
215264
pkg, cls, names = supported_model[model_name]
216265
else:
217266
# fallback to guessing
218267
pkg = f'test_tf.test_{model_name}'
219268
cls = f'Test{snake_to_pascal(model_name)}'
269+
270+
# get method name
220271
names = {
221272
s: str(idx)
222273
for idx, s in enumerate(self.wl.wtl.available_batch_sizes())
223274
}
224-
method = f'{cls}.{prefix}{names[self.wl.batch_size]}'
275+
276+
postfix = names.get(self.wl.batch_size, '0')
277+
if model_name == 'seq2seq' and postfix == '0':
278+
postfix = '2_large'
279+
280+
method = f'{cls}.{prefix}{postfix}'
225281
return pkg, method
226282

227283

@@ -240,7 +296,7 @@ def __call__(self, executor, output_file):
240296
cmd = [
241297
'stdbuf', '-o0', '-e0', '--',
242298
'python', '-m', 'fathom.cli',
243-
'--workload', self.wl.name.rsplit('eval')[0],
299+
'--workload', remove_suffix(self.wl.name, 'eval'),
244300
'--action', 'test' if self.wl.name.endswith('eval') else 'train',
245301
'--num_iters', str(self.wl.batch_num),
246302
'--batch_size', str(self.wl.batch_size),
@@ -262,9 +318,153 @@ def __call__(self, executor, output_file):
262318
else:
263319
raise ValueError(f'Unknown executor: {executor}')
264320

321+
cmd += self.wl.extra_args
322+
logger.info(f'Starting workload with cmd: {cmd}')
323+
324+
if FLAGS.no_capture:
325+
return execute(cmd, cwd=str(cwd), env=self.env)
326+
else:
327+
output_file.parent.mkdir(exist_ok=True, parents=True)
328+
with output_file.open('w') as f:
329+
return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
330+
331+
332+
class TFWebDirectRunner(Runner):
333+
"""Using TFWeb's load infrastructure to directly run"""
334+
335+
def __init__(self, wl, base_dir=None):
336+
super().__init__(wl)
337+
self.base_dir = base_dir
338+
if self.base_dir is None:
339+
self.base_dir = FLAGS.tfweb_base
340+
341+
def __call__(self, executor, output_file):
342+
model_name = remove_suffix(self.wl.name, 'eval')
343+
cwd = self.base_dir
344+
cmd = [
345+
'stdbuf', '-o0', '-e0', '--',
346+
'examples/direct/client',
347+
'--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))),
348+
'--batch_size={}'.format(self.wl.batch_size),
349+
'--batch_num={}'.format(self.wl.batch_num),
350+
]
351+
352+
if executor == Executor.Salus:
353+
cmd += [
354+
'--sess_target', SalusServer.current_server().endpoint,
355+
]
356+
elif executor == Executor.TF:
357+
cmd += [
358+
'--sess_target', '""',
359+
]
360+
elif executor == Executor.TFDist:
361+
cmd += [
362+
'--sess_target', TFDistServer.current_server().endpoint,
363+
]
364+
else:
365+
raise ValueError(f'Unknown executor: {executor}')
366+
cmd += self.wl.extra_args
367+
logger.info(f'Starting workload with cmd: {cmd}')
368+
265369
if FLAGS.no_capture:
266370
return execute(cmd, cwd=str(cwd), env=self.env)
267371
else:
268372
output_file.parent.mkdir(exist_ok=True, parents=True)
269373
with output_file.open('w') as f:
270374
return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
375+
376+
377+
class TFWebRunner(Runner):
378+
"""
379+
Run a TFWeb based inference job
380+
381+
We start several servers and a balancer on the same node.
382+
The server commandline: tfweb --model=path/to/saved_model/network --sess_target=...
383+
The client commandline: gobetween from-file xxx.toml
384+
"""
385+
386+
def __init__(self, wl, base_dir=None):
387+
super().__init__(wl)
388+
self.base_dir = base_dir
389+
if self.base_dir is None:
390+
self.base_dir = FLAGS.tfweb_base
391+
392+
def __call__(self, executor, output_file):
393+
# type: (Executor, Path) -> Popen
394+
model_name = remove_suffix(self.wl.name, 'web')
395+
cwd = self.base_dir
396+
cmd = [
397+
'stdbuf', '-o0', '-e0', '--',
398+
'examples/cluster/start_cluster',
399+
'--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))),
400+
]
401+
402+
if executor == Executor.Salus:
403+
cmd += [
404+
'--sess_target', SalusServer.current_server().endpoint,
405+
]
406+
elif executor == Executor.TF:
407+
cmd += [
408+
'--sess_target', '""',
409+
]
410+
elif executor == Executor.TFDist:
411+
cmd += [
412+
'--sess_target', TFDistServer.current_server().endpoint,
413+
]
414+
else:
415+
raise ValueError(f'Unknown executor: {executor}')
416+
417+
num_replicas = self.wl.env.pop('SALUS_TFWEB_REPLICAS', '1')
418+
cmd += [
419+
'--num_replicas', num_replicas
420+
]
421+
cmd += self.wl.extra_args
422+
logger.info(f'Starting workload with cmd: {cmd}')
423+
424+
if FLAGS.no_capture:
425+
return execute(cmd, cwd=str(cwd), env=self.env)
426+
else:
427+
output_file.parent.mkdir(exist_ok=True, parents=True)
428+
with output_file.open('w') as f:
429+
return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
430+
431+
432+
class TFWebClientRunner(Runner):
433+
"""
434+
Run a tfweb client attacker.
435+
Command: examples/cluster/tfweb-client TARGET REQ_BODY PLANTXT
436+
"""
437+
438+
def __init__(self, wl, base_dir=None):
439+
super().__init__(wl)
440+
self.base_dir = base_dir
441+
if self.base_dir is None:
442+
self.base_dir = FLAGS.tfweb_base
443+
444+
def __call__(self, executor, output_file):
445+
# type: (Executor, Path) -> Popen
446+
447+
model_name = remove_suffix(self.wl.name, 'client')
448+
449+
cwd = self.base_dir
450+
cmd = [
451+
'stdbuf', '-o0', '-e0', '--',
452+
'examples/tfweb-client',
453+
'-output', str(output_file),
454+
self.wl.target,
455+
# request body
456+
str(Path(FLAGS.tfweb_request_body_dir).joinpath(model_name).with_suffix('.txt')),
457+
# always write plan to stdin
458+
'-',
459+
]
460+
cmd += self.wl.extra_args
461+
logger.info(f'Starting workload with cmd: {cmd}')
462+
463+
proc = execute(cmd, cwd=str(cwd), env=self.env, stdin=sp.PIPE)
464+
proc.stdin.write(self._plan_to_bytes())
465+
proc.stdin.close()
466+
return proc
467+
468+
def _plan_to_bytes(self):
469+
return ' '.join(self.wl.plan).encode('utf-8')
470+

benchmarks/driver/server/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,9 +77,11 @@ def _find_executable(self):
7777
"""Find the absolute path to server executable, according to 'config.build_type'"""
7878
candidates = [
7979
self.config.build_dir / self.config.build_type / 'src' / 'executor',
80+
self.config.build_dir / self.config.build_type / 'src' / 'salus-server',
8081
self.config.build_dir / self.config.build_type / 'bin' / 'executor',
8182
self.config.build_dir / self.config.build_type / 'bin' / 'salus-server',
8283
self.config.build_dir / self.config.build_type.lower() / 'src' / 'executor',
84+
self.config.build_dir / self.config.build_type.lower() / 'src' / 'salus-server',
8385
self.config.build_dir / self.config.build_type.lower() / 'bin' / 'executor',
8486
self.config.build_dir / self.config.build_type.lower() / 'bin' / 'salus-server',
8587
]

0 commit comments

Comments
 (0)