Skip to content

Commit 7f23f38

Browse files
authored
3053 release *_dist.py tests memory to avoid OOM (#3537)
* adds min. memory testing utils Signed-off-by: Wenqi Li <[email protected]>
* include ValueError for robust outcome Signed-off-by: Wenqi Li <[email protected]>
* ensure float Signed-off-by: Wenqi Li <[email protected]>
* msg improvements Signed-off-by: Wenqi Li <[email protected]>
* update threshold Signed-off-by: Wenqi Li <[email protected]>
* remove ref Signed-off-by: Wenqi Li <[email protected]>
* separate disttests Signed-off-by: Wenqi Li <[email protected]>
* update based on comments Signed-off-by: Wenqi Li <[email protected]>
1 parent 21c5f6d commit 7f23f38

File tree

7 files changed

+27
-13
lines changed

7 files changed

+27
-13
lines changed

.github/pull_request_template.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request.
1212
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
1313
- [ ] New tests added to cover the changes.
1414
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
15-
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`.
15+
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
1616
- [ ] In-line docstrings updated.
1717
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

.github/workflows/cron.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ jobs:
4848
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
4949
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
5050
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
51-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
51+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
5252
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
5353
coverage xml
5454
if pgrep python; then pkill python; fi
@@ -91,7 +91,7 @@ jobs:
9191
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
9292
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
9393
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
94-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
94+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
9595
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
9696
coverage xml
9797
if pgrep python; then pkill python; fi
@@ -190,7 +190,7 @@ jobs:
190190
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
191191
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
192192
ngc --version
193-
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report
193+
BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report
194194
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
195195
coverage xml
196196
if pgrep python; then pkill python; fi

.github/workflows/integration.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ jobs:
4747
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
4848
python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))'
4949
BUILD_MONAI=1 ./runtests.sh --net
50-
BUILD_MONAI=1 ./runtests.sh --unittests
50+
BUILD_MONAI=1 ./runtests.sh --unittests --disttests
5151
if pgrep python; then pkill python; fi
5252
shell: bash
5353
- name: Add reaction

.github/workflows/pythonapp-gpu.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ jobs:
124124
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
125125
python -c "import monai; monai.config.print_config()"
126126
# build for the current self-hosted CI Tesla V100
127-
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
127+
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
128128
if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
129129
# test the clang-format tool downloading once
130130
coverage run -m tests.clang_format_utils

.github/workflows/setupapp.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ jobs:
5959
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
6060
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
6161
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
62-
BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
62+
BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
6363
BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
6464
coverage xml
6565
if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ jobs:
104104
run: |
105105
python -m pip list
106106
python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
107-
BUILD_MONAI=1 ./runtests.sh --quick --unittests
107+
BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
108108
coverage xml
109109
- name: Upload coverage
110110
uses: codecov/codecov-action@v1

runtests.sh

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -567,15 +567,19 @@ if [ $doUnitTests = true ]
567567
then
568568
echo "${separator}${blue}unittests${noColor}"
569569
torch_validate
570-
${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
570+
${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?<!_dist)$" # excluding integration/dist tests
571571
fi
572572

573573
# distributed test only
574574
if [ $doDistTests = true ]
575575
then
576576
echo "${separator}${blue}run distributed unit test cases${noColor}"
577577
torch_validate
578-
${cmdPrefix}${cmd} ./tests/runner.py -p "test_.*_dist$"
578+
for i in tests/test_*_dist.py
579+
do
580+
echo "$i"
581+
${cmdPrefix}${cmd} "$i"
582+
done
579583
fi
580584

581585
# network training/inference/eval integration tests

tests/utils.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
371371
os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)
372372

373373
if torch.cuda.is_available():
374-
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
375-
torch.cuda.set_device(int(local_rank))
374+
torch.cuda.set_device(int(local_rank))  # using device ids from CUDA_VISIBLE_DEVICES
376375

377376
dist.init_process_group(
378377
backend=self.backend,
@@ -427,6 +426,7 @@ def _wrapper(*args, **kwargs):
427426
for p in processes:
428427
p.join()
429428
assert results.get(), "Distributed call failed."
429+
_del_original_func(obj)
430430

431431
return _wrapper
432432

@@ -508,6 +508,7 @@ def _wrapper(*args, **kwargs):
508508
finally:
509509
p.join()
510510

511+
_del_original_func(obj)
511512
res = None
512513
try:
513514
res = results.get(block=False)
@@ -533,6 +534,15 @@ def _cache_original_func(obj) -> None:
533534
_original_funcs[obj.__name__] = obj
534535

535536

537+
def _del_original_func(obj):
538+
"""pop the original function from cache."""
539+
global _original_funcs
540+
_original_funcs.pop(obj.__name__, None)
541+
if torch.cuda.is_available(): # clean up the cached function
542+
torch.cuda.synchronize()
543+
torch.cuda.empty_cache()
544+
545+
536546
def _call_original_func(name, module, *args, **kwargs):
537547
if name not in _original_funcs:
538548
_original_module = importlib.import_module(module) # reimport, refresh _original_funcs
@@ -621,7 +631,7 @@ def test_script_save(net, *inputs, device=None, rtol=1e-4, atol=0.0):
621631

622632
def query_memory(n=2):
623633
"""
624-
Find best n idle devices and return a string of device ids.
634+
Find best n idle devices and return a string of device ids using the `nvidia-smi` command.
625635
"""
626636
bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits"
627637

0 commit comments

Comments
 (0)