Skip to content

Commit df48ddd

Browse files
author
Ralph Castain
authored
Merge pull request #4323 from aravindksg/fix_help_text
Move help text output regarding PSM2_CUDA environment variable
2 parents 5d208a1 + bea4503 commit df48ddd

File tree

2 files changed

+55
-35
lines changed

2 files changed

+55
-35
lines changed

ompi/mca/mtl/psm2/help-mtl-psm2.txt

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- text -*-
22
#
33
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
4-
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
4+
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
55
# $COPYRIGHT$
66
#
77
# Additional copyrights may follow
@@ -47,5 +47,17 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
4747
Message size %llu bigger than supported by PSM2 API. Max = %llu
4848
#
4949
[no psm2 cuda env]
50-
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
51-
This is not a recommended combination. If the application uses %s.
50+
Warning: Open MPI has detected that you are running in an environment with CUDA
51+
devices present and that you are using Intel(r) Omni-Path networking. However,
52+
the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
53+
networking library was not told how to handle CUDA support.
54+
55+
If your application uses CUDA buffers, you should set the environment variable
56+
PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
57+
can have performance implications on your application, or even cause it to
58+
crash.
59+
60+
Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
61+
environment variable to 1.
62+
63+
Local hostname: %s

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
1414
* Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
16+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
1717
* Copyright (c) 2017 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
@@ -28,6 +28,7 @@
2828
#include "opal/mca/event/event.h"
2929
#include "opal/util/output.h"
3030
#include "opal/util/show_help.h"
31+
#include "opal/util/opal_environ.h"
3132
#include "ompi/proc/proc.h"
3233

3334
#include "mtl_psm2.h"
@@ -45,6 +46,10 @@ static int param_priority;
4546
/* MPI_THREAD_MULTIPLE_SUPPORT */
4647
opal_mutex_t mtl_psm2_mq_mutex = OPAL_MUTEX_STATIC_INIT;
4748

49+
#if OPAL_CUDA_SUPPORT
50+
static bool cuda_envvar_set = false;
51+
#endif
52+
4853
static int ompi_mtl_psm2_component_open(void);
4954
static int ompi_mtl_psm2_component_close(void);
5055
static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
@@ -201,9 +206,6 @@ static int
201206
ompi_mtl_psm2_component_register(void)
202207
{
203208
int num_local_procs, num_total_procs;
204-
#if OPAL_CUDA_SUPPORT
205-
char *cuda_env;
206-
#endif
207209

208210
ompi_mtl_psm2.connect_timeout = 180;
209211
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
@@ -228,30 +230,6 @@ ompi_mtl_psm2_component_register(void)
228230
param_priority = 40;
229231
}
230232

231-
#if OPAL_CUDA_SUPPORT
232-
/*
233-
* If using CUDA enabled OpenMPI, the user likely intends to
234-
* run with CUDA buffers. So, force-set the envvar here if user failed
235-
* to set it.
236-
*/
237-
cuda_env = getenv("PSM2_CUDA");
238-
if (!cuda_env) {
239-
opal_show_help("help-mtl-psm2.txt",
240-
"no psm2 cuda env", true,
241-
"not set",
242-
"Host buffers,\nthere will be a performance penalty"
243-
" due to OMPI force setting this variable now.\n"
244-
"Set environment variable to 0 if using Host buffers" );
245-
setenv("PSM2_CUDA", "1", 0);
246-
} else if (strcmp(cuda_env, "0") == 0) {
247-
opal_show_help("help-mtl-psm2.txt",
248-
"no psm2 cuda env", true,
249-
"set to 0",
250-
"CUDA buffers,\nthe execution will SEGFAULT."
251-
" Set environment variable to 1 if using CUDA buffers");
252-
}
253-
#endif
254-
255233
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
256234
"priority", "Priority of the PSM2 MTL component",
257235
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
@@ -272,17 +250,16 @@ static int
272250
ompi_mtl_psm2_component_open(void)
273251
{
274252
int res;
275-
glob_t globbuf;
276-
globbuf.gl_offs = 0;
253+
glob_t globbuf = {0};
277254

278255
/* Component available only if Omni-Path hardware is present */
279256
res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
280-
if (0 == res || GLOB_NOMATCH == res) {
257+
if (globbuf.gl_pathc > 0) {
281258
globfree(&globbuf);
282259
}
283260
if (0 != res) {
284261
res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
285-
if (0 == res || GLOB_NOMATCH == res) {
262+
if (globbuf.gl_pathc > 0) {
286263
globfree(&globbuf);
287264
}
288265
if (0 != res) {
@@ -336,6 +313,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
336313
static int
337314
ompi_mtl_psm2_component_close(void)
338315
{
316+
#if OPAL_CUDA_SUPPORT
317+
if (cuda_envvar_set) {
318+
opal_unsetenv("PSM2_CUDA", &environ);
319+
}
320+
#endif
339321
return OMPI_SUCCESS;
340322
}
341323

@@ -362,6 +344,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
362344
int verno_major = PSM2_VERNO_MAJOR;
363345
int verno_minor = PSM2_VERNO_MINOR;
364346
int local_rank = -1, num_local_procs = 0;
347+
#if OPAL_CUDA_SUPPORT
348+
int ret;
349+
char *cuda_env;
350+
glob_t globbuf = {0};
351+
#endif
365352

366353
/* Compute the total number of processes on this host and our local rank
367354
* on that node. We need to provide PSM2 with these values so it can
@@ -389,6 +376,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
389376
ompi_mtl_psm2_set_shadow_env (ompi_mtl_psm2_shadow_variables + i);
390377
}
391378

379+
#if OPAL_CUDA_SUPPORT
380+
/*
381+
* If using CUDA enabled Open MPI, the user likely intends to
382+
* run with CUDA buffers. So, force-set the envvar here if user failed
383+
* to set it and the NVIDIA kernel module (/sys/module/nvidia) is present.
384+
*/
385+
ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
386+
if (globbuf.gl_pathc > 0) {
387+
globfree(&globbuf);
388+
}
389+
390+
cuda_env = getenv("PSM2_CUDA");
391+
if (!cuda_env && (0 == ret)) {
392+
opal_show_help("help-mtl-psm2.txt",
393+
"no psm2 cuda env", true,
394+
ompi_process_info.nodename);
395+
opal_setenv("PSM2_CUDA", "1", false, &environ);
396+
cuda_envvar_set = true;
397+
}
398+
#endif
399+
392400
err = psm2_init(&verno_major, &verno_minor);
393401
if (err) {
394402
opal_show_help("help-mtl-psm2.txt",

0 commit comments

Comments
 (0)