Add support for GPU buffers for PSM2 MTL #4172

Merged: 4 commits, Oct 31, 2017
4 changes: 4 additions & 0 deletions ompi/mca/mtl/mtl.h
@@ -5,6 +5,7 @@
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -61,6 +62,9 @@ typedef struct mca_mtl_request_t mca_mtl_request_t;
* MTL module flags
*/
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
#if OPAL_CUDA_SUPPORT
#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
#endif

/**
* Initialization routine for MTL component
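
The new flag only exists when Open MPI is built with CUDA support. Below is a minimal sketch, not part of this PR, of how a CUDA-capable MTL could advertise it from its module-init hook; the example_* names are invented for illustration, and the PSM2 MTL's real use of the flag appears in the mtl_psm2.c hunk further down.

    #include "ompi_config.h"
    #include "ompi/constants.h"
    #include "ompi/mca/mtl/mtl.h"

    /* Invented module instance standing in for a CUDA-capable MTL. */
    static mca_mtl_base_module_t example_mtl_module;

    static int example_mtl_module_init(void)
    {
    #if OPAL_CUDA_SUPPORT
        /* Tell the CM PML that this MTL accepts CUDA device buffers directly,
         * so the PML should leave its generic CUDA convertor handling disabled. */
        example_mtl_module.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
    #endif
        return OMPI_SUCCESS;
    }
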
18 changes: 17 additions & 1 deletion ompi/mca/mtl/psm2/help-mtl-psm2.txt
@@ -1,7 +1,7 @@
# -*- text -*-
#
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -45,3 +45,19 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
#
[message too big]
Message size %llu bigger than supported by PSM2 API. Max = %llu
#
[no psm2 cuda env]
Warning: Open MPI has detected that you are running in an environment with CUDA
devices present and that you are using Intel(R) Omni-Path networking. However,
the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
networking library was not told how to handle CUDA support.

If your application uses CUDA buffers, you should set the environment variable
PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
can have performance implications for your application, or even cause it to
crash.

Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
environment variable to 1.

Local hostname: %s
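
The warning above can be avoided entirely by choosing the value yourself before PSM2 initializes. A hypothetical sketch, not part of this PR, of an application that knows it will pass CUDA device buffers and therefore sets PSM2_CUDA before MPI_Init() (PSM2 reads its environment during initialization, which happens inside MPI_Init()):

    #include <stdlib.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        /* Last argument 0: do not clobber a value the user already exported. */
        setenv("PSM2_CUDA", "1", 0);

        MPI_Init(&argc, &argv);
        /* ... MPI calls on CUDA device buffers ... */
        MPI_Finalize();
        return 0;
    }

Exporting the variable in the launch environment instead (for example with mpirun's -x option) achieves the same result without touching application code.
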
6 changes: 5 additions & 1 deletion ompi/mca/mtl/psm2/mtl_psm2.c
@@ -11,7 +11,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 QLogic Corporation. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@@ -171,6 +171,10 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
/* register the psm2 progress function */
opal_progress_register(ompi_mtl_psm2_progress);

#if OPAL_CUDA_SUPPORT
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
#endif

return OMPI_SUCCESS;
}

66 changes: 56 additions & 10 deletions ompi/mca/mtl/psm2/mtl_psm2_component.c
@@ -11,9 +11,11 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -26,6 +28,7 @@
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "ompi/proc/proc.h"

#include "mtl_psm2.h"
@@ -41,6 +44,10 @@

static int param_priority;

#if OPAL_CUDA_SUPPORT
static bool cuda_envvar_set = false;
#endif

static int ompi_mtl_psm2_component_open(void);
static int ompi_mtl_psm2_component_close(void);
static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
@@ -89,6 +96,7 @@ ompi_mtl_psm2_component_register(void)

/* set priority high enough to beat ob1's default (also set higher than psm) */
param_priority = 40;

(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
"priority", "Priority of the PSM2 MTL component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
@@ -102,16 +110,23 @@ ompi_mtl_psm2_component_register(void)
static int
ompi_mtl_psm2_component_open(void)
{
glob_t globbuf;
globbuf.gl_offs = 0;
int res;
glob_t globbuf = {0};

/* Component available only if Omni-Path hardware is present */
if ((glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf) != 0) &&
(glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf) != 0)) {
return OPAL_ERR_NOT_AVAILABLE;
res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}
if (0 != res) {
res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}
if (0 != res) {
return OPAL_ERR_NOT_AVAILABLE;
}
}

globfree(&globbuf);

/* Component available only if at least one hfi1 port is ACTIVE */
bool foundOnlineHfi1Port = false;
@@ -159,6 +174,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
static int
ompi_mtl_psm2_component_close(void)
{
#if OPAL_CUDA_SUPPORT
if (cuda_envvar_set) {
opal_unsetenv("PSM2_CUDA", &environ);
}
#endif
return OMPI_SUCCESS;
}

@@ -202,6 +222,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
int verno_minor = PSM2_VERNO_MINOR;
int local_rank = -1, num_local_procs = 0;
int num_total_procs = 0;
#if OPAL_CUDA_SUPPORT
int ret;
char *cuda_env;
glob_t globbuf = {0};
#endif

/* Compute the total number of processes on this host and our local rank
* on that node. We need to provide PSM2 with these values so it can
@@ -234,6 +259,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
setenv("PSM2_DEVICES", "self,shm", 0);
}

#if OPAL_CUDA_SUPPORT
/*
* If using CUDA-enabled Open MPI, the user likely intends to
* run with CUDA buffers. So, force-set the envvar here if the user
* failed to set it.
*/
ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}

cuda_env = getenv("PSM2_CUDA");
if (!cuda_env && (0 == ret)) {
opal_show_help("help-mtl-psm2.txt",
"no psm2 cuda env", true,
ompi_process_info.nodename);
opal_setenv("PSM2_CUDA", "1", false, &environ);
cuda_envvar_set = true;
}
#endif

err = psm2_init(&verno_major, &verno_minor);
if (err) {
opal_show_help("help-mtl-psm2.txt",
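
The two probes added to this file are independent: component_open() looks for /dev/hfi1_* device nodes to decide whether Omni-Path hardware is present, and component_init() looks for the NVIDIA kernel module plus the PSM2_CUDA variable to decide whether to warn and default it to 1. A standalone sketch, not part of this PR, that reproduces the component_init() decision on a node so you can see what Open MPI would do there:

    #include <glob.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        glob_t g = {0};
        /* The pattern has no wildcards, so glob() simply reports whether
         * /sys/module/nvidia exists, i.e. whether the NVIDIA driver is loaded. */
        int have_nvidia = (0 == glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &g));
        if (g.gl_pathc > 0) {
            globfree(&g);
        }

        const char *cuda_env = getenv("PSM2_CUDA");
        if (NULL == cuda_env && have_nvidia) {
            printf("PSM2_CUDA unset and NVIDIA driver loaded: "
                   "Open MPI would warn and default PSM2_CUDA to 1\n");
        } else {
            printf("PSM2_CUDA=%s, NVIDIA driver %s: Open MPI leaves it alone\n",
                   cuda_env ? cuda_env : "(unset)",
                   have_nvidia ? "loaded" : "not loaded");
        }
        return 0;
    }
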
44 changes: 33 additions & 11 deletions ompi/mca/pml/cm/pml_cm.h
@@ -6,6 +6,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -79,6 +80,7 @@ mca_pml_cm_irecv_init(void *addr,
struct ompi_request_t **request)
{
mca_pml_cm_hvy_recv_request_t *recvreq;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
#endif
@@ -87,7 +89,7 @@
if( OPAL_UNLIKELY(NULL == recvreq) ) return OMPI_ERR_OUT_OF_RESOURCE;

MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
datatype, addr, count, true);
datatype, addr, count, flags, true);

*request = (ompi_request_t*) recvreq;

@@ -104,6 +106,7 @@ mca_pml_cm_irecv(void *addr,
struct ompi_request_t **request)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc = NULL;
Expand All @@ -118,7 +121,8 @@ mca_pml_cm_irecv(void *addr,
src,
datatype,
addr,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);

@@ -145,6 +149,7 @@ mca_pml_cm_recv(void *addr,
ompi_status_public_t * status)
{
int ret;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t *ompi_proc;
#endif
@@ -173,20 +178,24 @@ mca_pml_cm_recv(void *addr,
ompi_proc = ompi_comm_peer_lookup( comm, src );
}

MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_recv(
ompi_proc->super.proc_convertor,
&(datatype->super),
count,
addr,
0,
flags,
&convertor );
#else
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_recv(
ompi_mpi_local_convertor,
&(datatype->super),
count,
addr,
0,
flags,
&convertor );
#endif

@@ -222,6 +231,7 @@ mca_pml_cm_isend_init(const void* buf,
ompi_request_t** request)
{
mca_pml_cm_hvy_send_request_t *sendreq;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
#endif
@@ -230,7 +240,7 @@ mca_pml_cm_isend_init(const void* buf,
if (OPAL_UNLIKELY(NULL == sendreq)) return OMPI_ERR_OUT_OF_RESOURCE;

MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
datatype, sendmode, true, false, buf, count);
datatype, sendmode, true, false, buf, count, flags);

/* Work around a leak in start by marking this request as complete. The
* problem occurred because we do not have a way to differentiate an
@@ -254,6 +264,7 @@ mca_pml_cm_isend(const void* buf,
ompi_request_t** request)
{
int ret;
uint32_t flags = 0;

if(sendmode == MCA_PML_BASE_SEND_BUFFERED ) {
mca_pml_cm_hvy_send_request_t* sendreq;
@@ -274,7 +285,8 @@ mca_pml_cm_isend(const void* buf,
false,
false,
buf,
count);
count,
flags);

MCA_PML_CM_HVY_SEND_REQUEST_START( sendreq, ret);

@@ -296,7 +308,8 @@ mca_pml_cm_isend(const void* buf,
datatype,
sendmode,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_SEND_REQUEST_START(
sendreq,
@@ -324,6 +337,7 @@ mca_pml_cm_send(const void *buf,
ompi_communicator_t* comm)
{
int ret = OMPI_ERROR;
uint32_t flags = 0;
ompi_proc_t * ompi_proc;

if(sendmode == MCA_PML_BASE_SEND_BUFFERED) {
Expand All @@ -342,7 +356,8 @@ mca_pml_cm_send(const void *buf,
false,
false,
buf,
count);
count,
flags);
MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
MCA_PML_CM_HVY_SEND_REQUEST_RETURN(sendreq);
@@ -368,9 +383,12 @@ mca_pml_cm_send(const void *buf,
#endif
{
ompi_proc = ompi_comm_peer_lookup(comm, dst);

MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_send(
ompi_proc->super.proc_convertor,
&datatype->super, count, buf, 0,
&datatype->super, count, buf, flags,
&convertor);
}

@@ -459,6 +477,7 @@ mca_pml_cm_imrecv(void *buf,
struct ompi_request_t **request)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
@@ -474,7 +493,8 @@
(*message)->peer,
datatype,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);

@@ -491,6 +511,7 @@ mca_pml_cm_mrecv(void *buf,
ompi_status_public_t* status)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
@@ -506,7 +527,8 @@
(*message)->peer,
datatype,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq,
message, ret);
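
Throughout this file the new flags word is threaded from each send and receive path into opal_convertor_copy_and_prepare_for_send()/..._for_recv(), and MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF is invoked first to populate it. The macro itself is defined elsewhere in this PR and is not shown in this excerpt; the sketch below is only an illustration of the intended behavior, with EXAMPLE_* names invented for it: when the selected MTL advertises MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE, the convertor is asked to leave CUDA device buffers alone so the MTL (here PSM2) receives the original GPU pointer.

    /* Hypothetical illustration only; the real macro added by this PR may differ.
     * EXAMPLE_CONVERTOR_SKIP_CUDA stands in for whatever convertor flag the real
     * macro uses; ompi_mtl is the globally selected MTL module. */
    #if OPAL_CUDA_SUPPORT
    #define EXAMPLE_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)          \
        do {                                                                    \
            if (ompi_mtl->mtl_flags & MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE) {    \
                (flags) |= EXAMPLE_CONVERTOR_SKIP_CUDA;                         \
            }                                                                   \
        } while (0)
    #else
    #define EXAMPLE_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)
    #endif
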