Move help text output regarding PSM2_CUDA envvar to component init phase

aravindksg · aravindksg · commit 2719f26cc7da · 2017-10-27T10:29:33.000-07:00
The messages should be printed only in the event of CUDA builds and in the presence of supporting hardware and when PSM2 MTL has actually been selected for use. To this end, move help text output to component init phase. Also use opal_setenv/unsetenv() for safer setting, unsetting of the environment variable and sanitize the help text message. Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com> (cherry picked from commit bea4503) Conflicts: ompi/mca/mtl/psm2/mtl_psm2_component.c
diff --git a/ompi/mca/mtl/psm2/help-mtl-psm2.txt b/ompi/mca/mtl/psm2/help-mtl-psm2.txt
@@ -1,7 +1,7 @@
 # -*- text -*-
 #
 # Copyright (C) 2009. QLogic Corporation.  All rights reserved.
-# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
+# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -47,5 +47,17 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
 Message size %llu bigger than supported by PSM2 API. Max = %llu
 #
 [no psm2 cuda env]
-Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
-This is not a recommended combination. If the application uses %s.
+Warning: Open MPI has detected that you are running in an environment with CUDA
+devices present and that you are using Intel(r) Omni-Path networking. However,
+the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
+networking library was not told how to handle CUDA support.
+
+If your application uses CUDA buffers, you should set the environment variable
+PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
+can have performance implications on your application, or even cause it to
+crash.
+
+Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
+environment variable to 1.
+
+Local hostname: %s
diff --git a/ompi/mca/mtl/psm2/mtl_psm2_component.c b/ompi/mca/mtl/psm2/mtl_psm2_component.c
@@ -11,9 +11,9 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
- * Copyright (c) 2012-2015 Los Alamos National Security, LLC.
- *                         All rights reserved.
- * Copyright (c) 2013-2016 Intel, Inc. All rights reserved
+ * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved
  * Copyright (c) 2017      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
@@ -28,6 +28,7 @@
 #include "opal/mca/event/event.h"
 #include "opal/util/output.h"
 #include "opal/util/show_help.h"
+#include "opal/util/opal_environ.h"
 #include "ompi/proc/proc.h"
 
 #include "mtl_psm2.h"
@@ -43,6 +44,10 @@
 
 static int param_priority;
 
+#if OPAL_CUDA_SUPPORT
+static bool cuda_envvar_set = false;
+#endif
+
 static int ompi_mtl_psm2_component_open(void);
 static int ompi_mtl_psm2_component_close(void);
 static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
@@ -80,10 +85,6 @@ mca_mtl_psm2_component_t mca_mtl_psm2_component = {
 static int
 ompi_mtl_psm2_component_register(void)
 {
-#if OPAL_CUDA_SUPPORT
-    char *cuda_env;
-#endif
-
     ompi_mtl_psm2.connect_timeout = 180;
     (void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
                                            "connect_timeout",
@@ -95,29 +96,6 @@ ompi_mtl_psm2_component_register(void)
 
     /* set priority high enough to beat ob1's default (also set higher than psm) */
     param_priority = 40;
-#if OPAL_CUDA_SUPPORT
-    /*
-     * If using CUDA enabled OpenMPI, the user likely intends to
-     * run with CUDA buffers. So, force-set the envvar here if user failed
-     * to set it.
-     */
-    cuda_env = getenv("PSM2_CUDA");
-    if (!cuda_env) {
-        opal_show_help("help-mtl-psm2.txt",
-                       "no psm2 cuda env", true,
-                       "not set",
-                       "Host buffers,\nthere will be a performance penalty"
-                       " due to OMPI force setting this variable now.\n"
-                       "Set environment variable to 0 if using Host buffers" );
-        setenv("PSM2_CUDA", "1", 0);
-    } else if (strcmp(cuda_env, "0") == 0) {
-        opal_show_help("help-mtl-psm2.txt",
-                       "no psm2 cuda env", true,
-                       "set to 0",
-                       "CUDA buffers,\nthe execution will SEGFAULT."
-                       " Set environment variable to 1 if using CUDA buffers");
-    }
-#endif
 
     (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
                                             "priority", "Priority of the PSM2 MTL component",
@@ -133,17 +111,16 @@ static int
 ompi_mtl_psm2_component_open(void)
 {
   int res;
-  glob_t globbuf;
-  globbuf.gl_offs = 0;
+  glob_t globbuf = {0};
 
   /* Component available only if Omni-Path hardware is present */
   res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
-  if (0 == res || GLOB_NOMATCH == res) {
+  if (globbuf.gl_pathc > 0) {
       globfree(&globbuf);
   }
   if (0 != res) {
       res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
-      if (0 == res || GLOB_NOMATCH == res) {
+      if (globbuf.gl_pathc > 0) {
           globfree(&globbuf);
       }
       if (0 != res) {
@@ -197,6 +174,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
 static int
 ompi_mtl_psm2_component_close(void)
 {
+#if OPAL_CUDA_SUPPORT
+    if (cuda_envvar_set) {
+        opal_unsetenv("PSM2_CUDA", &environ);
+    }
+#endif
     return OMPI_SUCCESS;
 }
 
@@ -240,6 +222,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
     int verno_minor = PSM2_VERNO_MINOR;
     int local_rank = -1, num_local_procs = 0;
     int num_total_procs = 0;
+#if OPAL_CUDA_SUPPORT
+    int ret;
+    char *cuda_env;
+    glob_t globbuf = {0};
+#endif
 
     /* Compute the total number of processes on this host and our local rank
      * on that node. We need to provide PSM2 with these values so it can
@@ -272,6 +259,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
       setenv("PSM2_DEVICES", "self,shm", 0);
     }
 
+#if OPAL_CUDA_SUPPORT
+    /*
+     * If using CUDA enabled Open MPI, the user likely intends to
+     * run with CUDA buffers. So, force-set the envvar here if user failed
+     * to set it.
+     */
+    ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
+    if (globbuf.gl_pathc > 0) {
+        globfree(&globbuf);
+    }
+
+    cuda_env = getenv("PSM2_CUDA");
+    if (!cuda_env && (0 == ret)) {
+        opal_show_help("help-mtl-psm2.txt",
+                       "no psm2 cuda env", true,
+                       ompi_process_info.nodename);
+        opal_setenv("PSM2_CUDA", "1", false, &environ);
+        cuda_envvar_set = true;
+    }
+#endif
+
     err = psm2_init(&verno_major, &verno_minor);
     if (err) {
       opal_show_help("help-mtl-psm2.txt",