Skip to content

Commit d86d1bd

Browse files
authored
Refactor Zero Code Change Test Directory (aws#215)
1 parent 1b4671b commit d86d1bd

File tree

6 files changed

+54
-37
lines changed

6 files changed

+54
-37
lines changed
tests/zero_code_change/horovod_tests/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
HOROVOD_KERAS_TEST_SCRIPT_PATH = "horovod_keras_mnist.py"
2+
HOROVOD_KERAS_TEST_SCRIPT_ARGS = ["--num_epochs", "2"]
3+
4+
HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH = "horovod_estimator_mnist.py"
5+
HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS = ["--num_steps", "1000"]

tests/zero_code_change/horovod_tests/pytorch/__init__.py

Whitespace-only changes.

tests/zero_code_change/horovod_tests/tensorflow/test_estimator.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,36 @@
22
import os
33

44
# Third Party
5+
from tests.zero_code_change.horovod_tests.constants import (
6+
HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS,
7+
HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH,
8+
)
9+
from tests.zero_code_change.horovod_tests.utils import launch_horovod_job
510
from tests.zero_code_change.utils import build_json
611

712
# First Party
813
from smdebug.trials import create_trial
914

1015
# Local
11-
from .utils import get_available_gpus, launch_horovod_job
16+
from .utils import get_available_gpus
17+
1218

1319
"""
1420
Tested on current DLAMI p3.8xlarge when run from the main directory
1521
"""
1622

17-
HOROVOD_MNIST_SCRIPT_NAME = "horovod_estimator_mnist.py"
18-
HOROVOD_MNIST_ARGS = ["--num_steps", "1000"]
19-
2023

2124
def basic_test(out_dir, mode):
2225
path = build_json(out_dir, include_workers="one", include_collections=["weights", "gradients"])
2326
num_workers = len(get_available_gpus())
24-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", os.path.join(out_dir, "checkpoint")]
27+
mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
28+
"--model_dir",
29+
os.path.join(out_dir, "checkpoint"),
30+
]
2531
if mode == "cpu":
2632
mode_args += ["--use_only_cpu", "true"]
2733
launch_horovod_job(
28-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
34+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
2935
script_args=mode_args,
3036
num_workers=num_workers,
3137
config_file_path=path,
@@ -50,11 +56,14 @@ def test_gpu(out_dir):
5056
def mode_allworkers(out_dir, mode):
5157
path = build_json(out_dir, include_workers="all", include_collections=["weights", "gradients"])
5258
num_workers = len(get_available_gpus())
53-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", os.path.join(out_dir, "checkpoint")]
59+
mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
60+
"--model_dir",
61+
os.path.join(out_dir, "checkpoint"),
62+
]
5463
if mode == "cpu":
5564
mode_args += ["--use_only_cpu", "true"]
5665
launch_horovod_job(
57-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
66+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
5867
script_args=mode_args,
5968
num_workers=num_workers,
6069
config_file_path=path,
@@ -79,11 +88,14 @@ def mode_allworkers_saveall(out_dir, mode):
7988
out_dir, include_workers="all", save_all=True, include_collections=["weights", "gradients"]
8089
)
8190
num_workers = len(get_available_gpus())
82-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", os.path.join(out_dir, "checkpoint")]
91+
mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
92+
"--model_dir",
93+
os.path.join(out_dir, "checkpoint"),
94+
]
8395
if mode == "cpu":
8496
mode_args += ["--use_only_cpu", "true"]
8597
launch_horovod_job(
86-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
98+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
8799
script_args=mode_args,
88100
num_workers=num_workers,
89101
config_file_path=path,

tests/zero_code_change/horovod_tests/tensorflow/test_keras.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,31 @@
11
# First Party
22
# Third Party
3+
from tests.zero_code_change.horovod_tests.constants import (
4+
HOROVOD_KERAS_TEST_SCRIPT_ARGS,
5+
HOROVOD_KERAS_TEST_SCRIPT_PATH,
6+
)
7+
from tests.zero_code_change.horovod_tests.utils import launch_horovod_job
38
from tests.zero_code_change.utils import build_json
49

510
from smdebug.trials import create_trial
611

712
# Local
8-
from .utils import get_available_gpus, launch_horovod_job
13+
from .utils import get_available_gpus
14+
915

1016
"""
1117
Tested on current DLAMI p3.8xlarge
1218
"""
1319

14-
HOROVOD_MNIST_SCRIPT_NAME = "horovod_keras_mnist.py"
15-
HOROVOD_MNIST_ARGS = ["--num_epochs", "2"]
16-
1720

1821
def basic_test(out_dir, mode):
1922
path = build_json(out_dir, include_workers="one", include_collections=["weights", "gradients"])
2023
num_workers = len(get_available_gpus())
21-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", out_dir]
24+
mode_args = list(HOROVOD_KERAS_TEST_SCRIPT_ARGS) + ["--model_dir", out_dir]
2225
if mode == "cpu":
2326
mode_args += ["--use_only_cpu", "true"]
2427
launch_horovod_job(
25-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
28+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_KERAS_TEST_SCRIPT_PATH}",
2629
script_args=mode_args,
2730
num_workers=num_workers,
2831
config_file_path=path,
@@ -47,11 +50,11 @@ def test_gpu(out_dir):
4750
def mode_allworkers(out_dir, mode):
4851
path = build_json(out_dir, include_workers="all", include_collections=["weights", "gradients"])
4952
num_workers = len(get_available_gpus())
50-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", out_dir]
53+
mode_args = list(HOROVOD_KERAS_TEST_SCRIPT_ARGS) + ["--model_dir", out_dir]
5154
if mode == "cpu":
5255
mode_args += ["--use_only_cpu", "true"]
5356
launch_horovod_job(
54-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
57+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_KERAS_TEST_SCRIPT_PATH}",
5558
script_args=mode_args,
5659
num_workers=num_workers,
5760
config_file_path=path,
@@ -76,11 +79,11 @@ def mode_allworkers_saveall(out_dir, mode):
7679
out_dir, include_workers="all", save_all=True, include_collections=["weights", "gradients"]
7780
)
7881
num_workers = len(get_available_gpus())
79-
mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", out_dir]
82+
mode_args = list(HOROVOD_KERAS_TEST_SCRIPT_ARGS) + ["--model_dir", out_dir]
8083
if mode == "cpu":
8184
mode_args += ["--use_only_cpu", "true"]
8285
launch_horovod_job(
83-
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
86+
script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_KERAS_TEST_SCRIPT_PATH}",
8487
script_args=mode_args,
8588
num_workers=num_workers,
8689
config_file_path=path,
tests/zero_code_change/horovod_tests/tensorflow/utils.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,7 @@
1-
# Standard Library
2-
import os
3-
import subprocess
4-
import sys
5-
61
# Third Party
72
from tensorflow.python.client import device_lib
83

94

105
def get_available_gpus():
116
local_device_protos = device_lib.list_local_devices()
127
return [x.name for x in local_device_protos if x.device_type == "GPU"]
13-
14-
15-
def launch_horovod_job(script_file_path, script_args, num_workers, config_file_path, mode):
16-
command = (
17-
["horovodrun", "-np", str(num_workers)] + [sys.executable, script_file_path] + script_args
18-
)
19-
env_dict = os.environ.copy()
20-
env_dict["SMDEBUG_CONFIG_FILE_PATH"] = f"{config_file_path}"
21-
env_dict["PYTHONPATH"] = "/home/ubuntu/sagemaker-debugger/"
22-
if mode == "cpu":
23-
env_dict["CUDA_VISIBLE_DEVICES"] = "-1"
24-
subprocess.check_call(command, env=env_dict)
tests/zero_code_change/horovod_tests/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Standard Library
2+
import os
3+
import subprocess
4+
import sys
5+
6+
7+
def launch_horovod_job(script_file_path, script_args, num_workers, config_file_path, mode):
8+
command = ["mpirun", "-np", str(num_workers)] + [sys.executable, script_file_path] + script_args
9+
env_dict = os.environ.copy()
10+
env_dict["SMDEBUG_CONFIG_FILE_PATH"] = f"{config_file_path}"
11+
env_dict["PYTHONPATH"] = "/home/ubuntu/sagemaker-debugger/"
12+
if mode == "cpu":
13+
env_dict["CUDA_VISIBLE_DEVICES"] = "-1"
14+
subprocess.check_call(command, env=env_dict)

0 commit comments

Comments
 (0)