Skip to content

Commit 7f23f38

Browse files
authored
3053 release *_dist.py tests memory to avoid OOM (#3537)
* adds min. memory testing utils Signed-off-by: Wenqi Li <[email protected]>
* include ValueError for robust outcome Signed-off-by: Wenqi Li <[email protected]>
* ensure float Signed-off-by: Wenqi Li <[email protected]>
* msg improvements Signed-off-by: Wenqi Li <[email protected]>
* update threshold Signed-off-by: Wenqi Li <[email protected]>
* remove ref Signed-off-by: Wenqi Li <[email protected]>
* separate disttests Signed-off-by: Wenqi Li <[email protected]>
* update based on comments Signed-off-by: Wenqi Li <[email protected]>
1 parent 21c5f6d commit 7f23f38

File tree

7 files changed

+27
-13
lines changed

7 files changed

+27
-13
lines changed

.github/pull_request_template.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request.
1212
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
1313
- [ ] New tests added to cover the changes.
1414
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
15-
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`.
15+
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
1616
- [ ] In-line docstrings updated.
1717
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

.github/workflows/cron.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ jobs:
4848
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
4949
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
5050
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
51-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
51+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
5252
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
5353
coverage xml
5454
if pgrep python; then pkill python; fi
@@ -91,7 +91,7 @@ jobs:
9191
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
9292
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
9393
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
94-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
94+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
9595
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
9696
coverage xml
9797
if pgrep python; then pkill python; fi
@@ -190,7 +190,7 @@ jobs:
190190
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
191191
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
192192
ngc --version
193-
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report
193+
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report
194194
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
195195
coverage xml
196196
if pgrep python; then pkill python; fi

.github/workflows/integration.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ jobs:
4747
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
4848
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
4949
BUILD_MONAI=1 ./runtests.sh --net
50-
BUILD_MONAI=1 ./runtests.sh --unittests
50+
BUILD_MONAI=1 ./runtests.sh --unittests --disttests
5151
if pgrep python; then pkill python; fi
5252
shell: bash
5353
- name: Add reaction

.github/workflows/pythonapp-gpu.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ jobs:
124124
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
125125
python -c "import monai; monai.config.print_config()"
126126
# build for the current self-hosted CI Tesla V100
127-
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
127+
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
128128
if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
129129
# test the clang-format tool downloading once
130130
coverage run -m tests.clang_format_utils

.github/workflows/setupapp.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ jobs:
5959
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
6060
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
6161
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
62-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
62+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
6363
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
6464
coverage xml
6565
if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ jobs:
104104
run: |
105105
python -m pip list
106106
python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
107-
BUILD_MONAI=1 ./runtests.sh --quick --unittests
107+
BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
108108
coverage xml
109109
- name: Upload coverage
110110
uses: codecov/codecov-action@v1

runtests.sh

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -567,15 +567,19 @@ if [ $doUnitTests = true ]
567567
then
568568
echo "${separator}${blue}unittests${noColor}"
569569
torch_validate
570-
${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
570+
${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?<!_dist)$" # excluding integration/dist tests
571571
fi
572572

573573
# distributed test only
574574
if [ $doDistTests = true ]
575575
then
576576
echo "${separator}${blue}run distributed unit test cases${noColor}"
577577
torch_validate
578-
${cmdPrefix}${cmd} ./tests/runner.py -p "test_.*_dist$"
578+
for i in tests/test_*_dist.py
579+
do
580+
echo "$i"
581+
${cmdPrefix}${cmd} "$i"
582+
done
579583
fi
580584

581585
# network training/inference/eval integration tests

tests/utils.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
371371
os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)
372372

373373
if torch.cuda.is_available():
374-
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
375-
torch.cuda.set_device(int(local_rank))
374+
torch.cuda.set_device(int(local_rank))  # using device ids from CUDA_VISIBLE_DEVICES
376375

377376
dist.init_process_group(
378377
backend=self.backend,
@@ -427,6 +426,7 @@ def _wrapper(*args, **kwargs):
427426
for p in processes:
428427
p.join()
429428
assert results.get(), "Distributed call failed."
429+
_del_original_func(obj)
430430

431431
return _wrapper
432432

@@ -508,6 +508,7 @@ def _wrapper(*args, **kwargs):
508508
finally:
509509
p.join()
510510

511+
_del_original_func(obj)
511512
res = None
512513
try:
513514
res = results.get(block=False)
@@ -533,6 +534,15 @@ def _cache_original_func(obj) -> None:
533534
_original_funcs[obj.__name__] = obj
534535

535536

537+
def _del_original_func(obj):
538+
"""pop the original function from cache."""
539+
global _original_funcs
540+
_original_funcs.pop(obj.__name__, None)
541+
if torch.cuda.is_available(): # clean up the cached function
542+
torch.cuda.synchronize()
543+
torch.cuda.empty_cache()
544+
545+
536546
def _call_original_func(name, module, *args, **kwargs):
537547
if name not in _original_funcs:
538548
_original_module = importlib.import_module(module) # reimport, refresh _original_funcs
@@ -621,7 +631,7 @@ def test_script_save(net, *inputs, device=None, rtol=1e-4, atol=0.0):
621631

622632
def query_memory(n=2):
623633
"""
624-
Find best n idle devices and return a string of device ids.
634+
Find best n idle devices and return a string of device ids using the `nvidia-smi` command.
625635
"""
626636
bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits"
627637

0 commit comments

Comments
 (0)