Skip to content

Commit 2719f26

Browse files
committed
Move help text output regarding PSM2_CUDA envvar to component init phase
The messages should be printed only for CUDA builds, in the presence of supporting hardware, and when the PSM2 MTL has actually been selected for use. To this end, move the help text output to the component init phase. Also use opal_setenv()/opal_unsetenv() for safer setting and unsetting of the environment variable, and sanitize the help text message. Signed-off-by: Aravind Gopalakrishnan <[email protected]> (cherry picked from commit bea4503) Conflicts: ompi/mca/mtl/psm2/mtl_psm2_component.c
1 parent b0dc826 commit 2719f26

File tree

2 files changed

+57
-37
lines changed

2 files changed

+57
-37
lines changed

ompi/mca/mtl/psm2/help-mtl-psm2.txt

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- text -*-
22
#
33
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
4-
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
4+
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
55
# $COPYRIGHT$
66
#
77
# Additional copyrights may follow
@@ -47,5 +47,17 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
4747
Message size %llu bigger than supported by PSM2 API. Max = %llu
4848
#
4949
[no psm2 cuda env]
50-
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
51-
This is not a recommended combination. If the application uses %s.
50+
Warning: Open MPI has detected that you are running in an environment with CUDA
51+
devices present and that you are using Intel(r) Omni-Path networking. However,
52+
the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
53+
networking library was not told how to handle CUDA support.
54+
55+
If your application uses CUDA buffers, you should set the environment variable
56+
PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
57+
can have performance implications on your application, or even cause it to
58+
crash.
59+
60+
Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
61+
environment variable to 1.
62+
63+
Local hostname: %s

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
14-
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
15-
* All rights reserved.
16-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
14+
* Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
15+
* reserved.
16+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
1717
* Copyright (c) 2017 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
@@ -28,6 +28,7 @@
2828
#include "opal/mca/event/event.h"
2929
#include "opal/util/output.h"
3030
#include "opal/util/show_help.h"
31+
#include "opal/util/opal_environ.h"
3132
#include "ompi/proc/proc.h"
3233

3334
#include "mtl_psm2.h"
@@ -43,6 +44,10 @@
4344

4445
static int param_priority;
4546

47+
#if OPAL_CUDA_SUPPORT
48+
static bool cuda_envvar_set = false;
49+
#endif
50+
4651
static int ompi_mtl_psm2_component_open(void);
4752
static int ompi_mtl_psm2_component_close(void);
4853
static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
@@ -80,10 +85,6 @@ mca_mtl_psm2_component_t mca_mtl_psm2_component = {
8085
static int
8186
ompi_mtl_psm2_component_register(void)
8287
{
83-
#if OPAL_CUDA_SUPPORT
84-
char *cuda_env;
85-
#endif
86-
8788
ompi_mtl_psm2.connect_timeout = 180;
8889
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
8990
"connect_timeout",
@@ -95,29 +96,6 @@ ompi_mtl_psm2_component_register(void)
9596

9697
/* set priority high enough to beat ob1's default (also set higher than psm) */
9798
param_priority = 40;
98-
#if OPAL_CUDA_SUPPORT
99-
/*
100-
* If using CUDA enabled OpenMPI, the user likely intends to
101-
* run with CUDA buffers. So, force-set the envvar here if user failed
102-
* to set it.
103-
*/
104-
cuda_env = getenv("PSM2_CUDA");
105-
if (!cuda_env) {
106-
opal_show_help("help-mtl-psm2.txt",
107-
"no psm2 cuda env", true,
108-
"not set",
109-
"Host buffers,\nthere will be a performance penalty"
110-
" due to OMPI force setting this variable now.\n"
111-
"Set environment variable to 0 if using Host buffers" );
112-
setenv("PSM2_CUDA", "1", 0);
113-
} else if (strcmp(cuda_env, "0") == 0) {
114-
opal_show_help("help-mtl-psm2.txt",
115-
"no psm2 cuda env", true,
116-
"set to 0",
117-
"CUDA buffers,\nthe execution will SEGFAULT."
118-
" Set environment variable to 1 if using CUDA buffers");
119-
}
120-
#endif
12199

122100
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
123101
"priority", "Priority of the PSM2 MTL component",
@@ -133,17 +111,16 @@ static int
133111
ompi_mtl_psm2_component_open(void)
134112
{
135113
int res;
136-
glob_t globbuf;
137-
globbuf.gl_offs = 0;
114+
glob_t globbuf = {0};
138115

139116
/* Component available only if Omni-Path hardware is present */
140117
res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
141-
if (0 == res || GLOB_NOMATCH == res) {
118+
if (globbuf.gl_pathc > 0) {
142119
globfree(&globbuf);
143120
}
144121
if (0 != res) {
145122
res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
146-
if (0 == res || GLOB_NOMATCH == res) {
123+
if (globbuf.gl_pathc > 0) {
147124
globfree(&globbuf);
148125
}
149126
if (0 != res) {
@@ -197,6 +174,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
197174
static int
198175
ompi_mtl_psm2_component_close(void)
199176
{
177+
#if OPAL_CUDA_SUPPORT
178+
if (cuda_envvar_set) {
179+
opal_unsetenv("PSM2_CUDA", &environ);
180+
}
181+
#endif
200182
return OMPI_SUCCESS;
201183
}
202184

@@ -240,6 +222,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
240222
int verno_minor = PSM2_VERNO_MINOR;
241223
int local_rank = -1, num_local_procs = 0;
242224
int num_total_procs = 0;
225+
#if OPAL_CUDA_SUPPORT
226+
int ret;
227+
char *cuda_env;
228+
glob_t globbuf = {0};
229+
#endif
243230

244231
/* Compute the total number of processes on this host and our local rank
245232
* on that node. We need to provide PSM2 with these values so it can
@@ -272,6 +259,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
272259
setenv("PSM2_DEVICES", "self,shm", 0);
273260
}
274261

262+
#if OPAL_CUDA_SUPPORT
263+
/*
264+
* If using CUDA enabled Open MPI, the user likely intends to
265+
* run with CUDA buffers. So, force-set the envvar here if user failed
266+
* to set it.
267+
*/
268+
ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
269+
if (globbuf.gl_pathc > 0) {
270+
globfree(&globbuf);
271+
}
272+
273+
cuda_env = getenv("PSM2_CUDA");
274+
if (!cuda_env && (0 == ret)) {
275+
opal_show_help("help-mtl-psm2.txt",
276+
"no psm2 cuda env", true,
277+
ompi_process_info.nodename);
278+
opal_setenv("PSM2_CUDA", "1", false, &environ);
279+
cuda_envvar_set = true;
280+
}
281+
#endif
282+
275283
err = psm2_init(&verno_major, &verno_minor);
276284
if (err) {
277285
opal_show_help("help-mtl-psm2.txt",

0 commit comments

Comments
 (0)