-
Notifications
You must be signed in to change notification settings - Fork 43
Description
We are trying to run the RNNT benchmark on our DGX Station (https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/dgx-station/368040-DGX-Station-DS-R11.pdf). Please help us set the right config parameters. Here are our logs after executing the command `CONT=mlperf-nvidia:rnn_speech_recognition-pytorch DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> METADATA_DIR=<path/to/metadata/dir> SENTENCEPIECES_DIR=<path/to/sentencepieces/dir> bash ./run_with_docker.sh`:
- : baseline_DGXA100_8x8x32x1
- : mlperf-nvidia:rnn_speech_recognition-pytorch
- : 10
++ date +%y%m%d%H%M%S%N - : 211008164556434440824
- : 1
- : /raid/speech_processing/pytorch/datasets
- : /raid/speech_processing/pytorch/results
- : ./api_logs
- : 17584
- : 01:00:00
- readonly _config_file=./config_baseline_DGXA100_8x8x32x1.sh
- _config_file=./config_baseline_DGXA100_8x8x32x1.sh
- readonly _logfile_base=/raid/speech_processing/pytorch/results/result_211008164556434440824
- _logfile_base=/raid/speech_processing/pytorch/results/result_211008164556434440824
- readonly _cont_name=rnn_speech_recognition
- _cont_name=rnn_speech_recognition
- _cont_mounts=("--volume=${DATADIR}:/datasets/" "--volume=${LOGDIR}:/results" "--volume=${METADATA_DIR}:/metadata" "--volume=${SENTENCEPIECES_DIR}:/sentencepieces")
- '[' '' -eq 1 ']'
./run_with_docker.sh: line 30: [: : integer expression expected
++ source /etc/os-release
+++ NAME=Ubuntu
+++ VERSION='18.04.5 LTS (Bionic Beaver)'
+++ ID=ubuntu
+++ ID_LIKE=debian
+++ PRETTY_NAME='Ubuntu 18.04.5 LTS'
+++ VERSION_ID=18.04
+++ HOME_URL=https://www.ubuntu.com/
+++ SUPPORT_URL=https://help.ubuntu.com/
+++ BUG_REPORT_URL=https://bugs.launchpad.net/ubuntu/
+++ PRIVACY_POLICY_URL=https://www.ubuntu.com/legal/terms-and-policies/privacy-policy
+++ VERSION_CODENAME=bionic
+++ UBUNTU_CODENAME=bionic
++ source /etc/dgx-release
+++ DGX_NAME='DGX Station'
+++ DGX_PRETTY_NAME='NVIDIA DGX Station'
+++ DGX_SWBUILD_DATE=2017-10-31
+++ DGX_SWBUILD_VERSION=3.1.3
+++ DGX_COMMIT_ID=31e745794370d852fdb0a178ef022a872f58efdf
+++ DGX_SERIAL_NUMBER=0154917000004
+++ DGX_OTA_VERSION=3.1.4
+++ DGX_OTA_DATE='Wed Jan 31 15:04:20 IST 2018'
+++ DGX_OTA_VERSION=3.1.7
+++ DGX_OTA_DATE='Tue Nov 27 15:28:38 IST 2018'
+++ DGX_OTA_VERSION=4.0.4
+++ DGX_OTA_DATE='Thu Dec 13 15:09:08 IST 2018'
+++ DGX_OTA_VERSION=4.0.6
+++ DGX_OTA_DATE='Wed Aug 7 19:13:38 IST 2019'
+++ DGX_OTA_VERSION=4.0.7
+++ DGX_OTA_DATE='Mon Sep 14 09:48:51 IST 2020'
++ echo 'Ubuntu 18.04.5 LTS / NVIDIA DGX Station 4.0.7' - MLPERF_HOST_OS='Ubuntu 18.04.5 LTS / NVIDIA DGX Station 4.0.7'
- export MLPERF_HOST_OS
- mkdir -p /raid/speech_processing/pytorch/results
- source ./config_baseline_DGXA100_8x8x32x1.sh
++ export DGXNNODES=8
++ DGXNNODES=8
+++ sed 's/^config_//'
+++ sed 's/.sh$//'
++++ readlink -f ./config_baseline_DGXA100_8x8x32x1.sh
+++ basename /raid/speech_processing/pytorch/config_baseline_DGXA100_8x8x32x1.sh
++ export DGXSYSTEM=baseline_DGXA100_8x8x32x1
++ DGXSYSTEM=baseline_DGXA100_8x8x32x1
++ export DGXNGPU=8
++ DGXNGPU=8
++ export DGXSOCKETCORES=24
++ DGXSOCKETCORES=24
++ export DGXNSOCKET=2
++ DGXNSOCKET=2
++ export DGXHT=2
++ DGXHT=2
++ export GRAD_ACCUMULATION_STEPS=1
++ GRAD_ACCUMULATION_STEPS=1
++ export DATADIR=/raid/datasets/rnnt/LibriSpeech/
++ DATADIR=/raid/datasets/rnnt/LibriSpeech/
++ export BATCHSIZE=32
++ BATCHSIZE=32
++ export EVAL_BATCHSIZE=2
++ EVAL_BATCHSIZE=2
++ export WALLTIME=01:00:00
++ WALLTIME=01:00:00
++ export VAL_FREQUENCY=1
++ VAL_FREQUENCY=1
++ export MAX_SYMBOL=300
++ MAX_SYMBOL=300
++ export EPOCH=90
++ EPOCH=90
++ export SEED=23975
++ SEED=23975
++ export LR=0.007
++ LR=0.007
++ export WEIGHTS_INIT_SCALE=0.5
++ WEIGHTS_INIT_SCALE=0.5
++ export DATA_CPU_THREADS=8
++ DATA_CPU_THREADS=8 - mapfile -t _config_env
++ env -i bash -c '. ./config_baseline_DGXA100_8x8x32x1.sh && compgen -e'
++ grep -E -v '^(PWD|SHLVL)' - _config_env+=(MLPERF_HOST_OS)
- mapfile -t _config_env
++ for v in "${_config_env[@]}"
++ echo --env=BATCHSIZE
++ for v in "${_config_env[@]}"
++ echo --env=DATADIR
++ for v in "${_config_env[@]}"
++ echo --env=DATA_CPU_THREADS
++ for v in "${_config_env[@]}"
++ echo --env=DGXHT
++ for v in "${_config_env[@]}"
++ echo --env=DGXNGPU
++ for v in "${_config_env[@]}"
++ echo --env=DGXNNODES
++ for v in "${_config_env[@]}"
++ echo --env=DGXNSOCKET
++ for v in "${_config_env[@]}"
++ echo --env=DGXSOCKETCORES
++ for v in "${_config_env[@]}"
++ echo --env=DGXSYSTEM
++ for v in "${_config_env[@]}"
++ echo --env=EPOCH
++ for v in "${_config_env[@]}"
++ echo --env=EVAL_BATCHSIZE
++ for v in "${_config_env[@]}"
++ echo --env=GRAD_ACCUMULATION_STEPS
++ for v in "${_config_env[@]}"
++ echo --env=LR
++ for v in "${_config_env[@]}"
++ echo --env=MAX_SYMBOL
++ for v in "${_config_env[@]}"
++ echo --env=SEED
++ for v in "${_config_env[@]}"
++ echo --env=VAL_FREQUENCY
++ for v in "${_config_env[@]}"
++ echo --env=WALLTIME
++ for v in "${_config_env[@]}"
++ echo --env=WEIGHTS_INIT_SCALE
++ for v in "${_config_env[@]}"
++ echo --env=MLPERF_HOST_OS - docker run --rm --init --detach --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --name=rnn_speech_recognition --volume=/raid/speech_processing/pytorch/datasets:/datasets/ --volume=/raid/speech_processing/pytorch/results:/results --volume=/raid/speech_processing/pytorch/tokenized:/metadata --volume=/raid/speech_processing/pytorch/sentenpieces:/sentencepieces mlperf-nvidia:rnn_speech_recognition-pytorch sleep infinity
0a6482c870f1468836b708d3f340a3dfec7b49ef2b1c352b2c5bf588803cf29c - docker exec -it rnn_speech_recognition true
- [[ baseline_DGXA100_8x8x32x1 == \D\G\X\A\1\0\0* ]]
++ seq 1 10 - for _experiment_index in
$(seq 1 "${NEXP}") - tee /raid/speech_processing/pytorch/results/result_211008164556434440824_1.txt
tee: /raid/speech_processing/pytorch/results/result_211008164556434440824_1.txt: Permission denied - echo 'Beginning trial 1 of 10'
Beginning trial 1 of 10 - docker exec -it rnn_speech_recognition python -c ''
- '[' 1 -eq 1 ']'
- sync
- docker exec -it rnn_speech_recognition python -c '
from mlperf import logging
logging.log_event(key=logging.constants.CACHE_CLEAR, value=True)'
:::MLLOG {"namespace": "", "time_ms": 1633691758520, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": true, "metadata": {"file": "", "lineno": 3}} - docker exec -it --env=BATCHSIZE --env=DATADIR --env=DATA_CPU_THREADS --env=DGXHT --env=DGXNGPU --env=DGXNNODES --env=DGXNSOCKET --env=DGXSOCKETCORES --env=DGXSYSTEM --env=EPOCH --env=EVAL_BATCHSIZE --env=GRAD_ACCUMULATION_STEPS --env=LR --env=MAX_SYMBOL --env=SEED --env=VAL_FREQUENCY --env=WALLTIME --env=WEIGHTS_INIT_SCALE --env=MLPERF_HOST_OS rnn_speech_recognition ./run_and_time.sh
./run_and_time.sh: line 24: [: : integer expression expected
STARTING TIMING RUN AT 2021-10-08 11:15:58 AM
running benchmark
python -u -m bind_launch --nsockets_per_node=2 --ncores_per_socket=24 --nproc_per_node=8
./run_and_time.sh: line 140: [: -ne: unary operator expected
libnuma: Warning: cpu argument 48-53 is out of range
<0-5,48-53> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 54-59 is out of range
<6-11,54-59> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 60-65 is out of range
<12-17,60-65> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 66-71 is out of range
<18-23,66-71> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 72-77 is out of range
<24-29,72-77> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 78-83 is out of range
<30-35,78-83> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 41,84-89 out of range
<36-41,84-89> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
libnuma: Warning: cpu argument 42-47,90-95 is out of range
<42-47,90-95> is invalid
usage: numactl [--all | -a] [--interleave= | -i ] [--preferred= | -p ]
[--physcpubind= | -C ] [--cpunodebind= | -N ]
[--membind= | -m ] [--localalloc | -l] command args ...
numactl [--show | -s]
numactl [--hardware | -H]
numactl [--length | -l ] [--offset | -o ] [--shmmode | -M ]
[--strict | -t]
[--shmid | -I ] --shm | -S
[--shmid | -I ] --file | -f
[--huge | -u] [--touch | -T]
memory policy | --dump | -d | --dump-nodes | -D
memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l
is a comma delimited list of node numbers or A-B ranges or all.
Instead of a number a node can also be:
netdev:DEV the node connected to network device DEV
file:PATH the node the block device of path is connected to
ip:HOST the node of the network device host routes through
block:PATH the node of block device path
pci:[seg:]bus:dev[:func] The node of a PCI device
is a comma delimited list of cpu numbers or A-B ranges or all
all ranges can be inverted with !
all numbers and ranges can be made cpuset-relative with +
the old --cpubind argument is deprecated.
use --cpunodebind or --physcpubind instead
can have g (GB), m (MB) or k (KB) suffixes
ENDING TIMING RUN AT 2021-10-08 11:16:02 AM
RESULT,RNN_SPEECH_RECOGNITION,,4,nvidia,2021-10-08 11:15:58 AM