Add support for GPU buffers for PSM2 MTL #4172

Merged: 4 commits, Oct 31, 2017
4 changes: 4 additions & 0 deletions ompi/mca/mtl/mtl.h
@@ -5,6 +5,7 @@
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -61,6 +62,9 @@ typedef struct mca_mtl_request_t mca_mtl_request_t;
* MTL module flags
*/
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
#if OPAL_CUDA_SUPPORT
#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
#endif

/**
* Initialization routine for MTL component
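
The new flag only exists when Open MPI is built with CUDA support. Below is a minimal sketch, not part of this PR, of how a CUDA-capable MTL could advertise it from its module-init hook; the example_* names are invented for illustration, and the PSM2 MTL's real use of the flag appears in the mtl_psm2.c hunk further down.

    #include "ompi_config.h"
    #include "ompi/constants.h"
    #include "ompi/mca/mtl/mtl.h"

    /* Invented module instance standing in for a CUDA-capable MTL. */
    static mca_mtl_base_module_t example_mtl_module;

    static int example_mtl_module_init(void)
    {
    #if OPAL_CUDA_SUPPORT
        /* Tell the CM PML that this MTL accepts CUDA device buffers directly,
         * so the PML should leave its generic CUDA convertor handling disabled. */
        example_mtl_module.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
    #endif
        return OMPI_SUCCESS;
    }
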
18 changes: 17 additions & 1 deletion ompi/mca/mtl/psm2/help-mtl-psm2.txt
@@ -1,7 +1,7 @@
# -*- text -*-
#
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -45,3 +45,19 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
#
[message too big]
Message size %llu bigger than supported by PSM2 API. Max = %llu
#
[no psm2 cuda env]
Warning: Open MPI has detected that you are running in an environment with CUDA
devices present and that you are using Intel(R) Omni-Path networking. However,
the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
networking library was not told how to handle CUDA support.

If your application uses CUDA buffers, you should set the environment variable
PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
can have performance implications for your application, or even cause it to
crash.

Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
environment variable to 1.

Local hostname: %s
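
The warning above can be avoided entirely by choosing the value yourself before PSM2 initializes. A hypothetical sketch, not part of this PR, of an application that knows it will pass CUDA device buffers and therefore sets PSM2_CUDA before MPI_Init() (PSM2 reads its environment during initialization, which happens inside MPI_Init()):

    #include <stdlib.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        /* Last argument 0: do not clobber a value the user already exported. */
        setenv("PSM2_CUDA", "1", 0);

        MPI_Init(&argc, &argv);
        /* ... MPI calls on CUDA device buffers ... */
        MPI_Finalize();
        return 0;
    }

Exporting the variable in the launch environment instead (for example with mpirun's -x option) achieves the same result without touching application code.
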
6 changes: 5 additions & 1 deletion ompi/mca/mtl/psm2/mtl_psm2.c
@@ -11,7 +11,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 QLogic Corporation. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@@ -171,6 +171,10 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
/* register the psm2 progress function */
opal_progress_register(ompi_mtl_psm2_progress);

#if OPAL_CUDA_SUPPORT
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
#endif

return OMPI_SUCCESS;
}

66 changes: 56 additions & 10 deletions ompi/mca/mtl/psm2/mtl_psm2_component.c
@@ -11,9 +11,11 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -26,6 +28,7 @@
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "ompi/proc/proc.h"

#include "mtl_psm2.h"
@@ -41,6 +44,10 @@

static int param_priority;

#if OPAL_CUDA_SUPPORT
static bool cuda_envvar_set = false;
#endif

static int ompi_mtl_psm2_component_open(void);
static int ompi_mtl_psm2_component_close(void);
static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
@@ -89,6 +96,7 @@ ompi_mtl_psm2_component_register(void)

/* set priority high enough to beat ob1's default (also set higher than psm) */
param_priority = 40;

(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
"priority", "Priority of the PSM2 MTL component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
@@ -102,16 +110,23 @@ ompi_mtl_psm2_component_register(void)
static int
ompi_mtl_psm2_component_open(void)
{
glob_t globbuf;
globbuf.gl_offs = 0;
int res;
glob_t globbuf = {0};

/* Component available only if Omni-Path hardware is present */
if ((glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf) != 0) &&
(glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf) != 0)) {
return OPAL_ERR_NOT_AVAILABLE;
res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}
if (0 != res) {
res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}
if (0 != res) {
return OPAL_ERR_NOT_AVAILABLE;
}
}

globfree(&globbuf);

/* Component available only if at least one hfi1 port is ACTIVE */
bool foundOnlineHfi1Port = false;
@@ -159,6 +174,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
static int
ompi_mtl_psm2_component_close(void)
{
#if OPAL_CUDA_SUPPORT
if (cuda_envvar_set) {
opal_unsetenv("PSM2_CUDA", &environ);
}
#endif
return OMPI_SUCCESS;
}

@@ -202,6 +222,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
int verno_minor = PSM2_VERNO_MINOR;
int local_rank = -1, num_local_procs = 0;
int num_total_procs = 0;
#if OPAL_CUDA_SUPPORT
int ret;
char *cuda_env;
glob_t globbuf = {0};
#endif

/* Compute the total number of processes on this host and our local rank
* on that node. We need to provide PSM2 with these values so it can
@@ -234,6 +259,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
setenv("PSM2_DEVICES", "self,shm", 0);
}

#if OPAL_CUDA_SUPPORT
/*
* If using CUDA-enabled Open MPI, the user likely intends to
* run with CUDA buffers. So, force-set the envvar here if the user
* failed to set it.
*/
ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
if (globbuf.gl_pathc > 0) {
globfree(&globbuf);
}

cuda_env = getenv("PSM2_CUDA");
if (!cuda_env && (0 == ret)) {
opal_show_help("help-mtl-psm2.txt",
"no psm2 cuda env", true,
ompi_process_info.nodename);
opal_setenv("PSM2_CUDA", "1", false, &environ);
cuda_envvar_set = true;
}
#endif

err = psm2_init(&verno_major, &verno_minor);
if (err) {
opal_show_help("help-mtl-psm2.txt",
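
The two probes added to this file are independent: component_open() looks for /dev/hfi1_* device nodes to decide whether Omni-Path hardware is present, and component_init() looks for the NVIDIA kernel module plus the PSM2_CUDA variable to decide whether to warn and default it to 1. A standalone sketch, not part of this PR, that reproduces the component_init() decision on a node so you can see what Open MPI would do there:

    #include <glob.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        glob_t g = {0};
        /* The pattern has no wildcards, so glob() simply reports whether
         * /sys/module/nvidia exists, i.e. whether the NVIDIA driver is loaded. */
        int have_nvidia = (0 == glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &g));
        if (g.gl_pathc > 0) {
            globfree(&g);
        }

        const char *cuda_env = getenv("PSM2_CUDA");
        if (NULL == cuda_env && have_nvidia) {
            printf("PSM2_CUDA unset and NVIDIA driver loaded: "
                   "Open MPI would warn and default PSM2_CUDA to 1\n");
        } else {
            printf("PSM2_CUDA=%s, NVIDIA driver %s: Open MPI leaves it alone\n",
                   cuda_env ? cuda_env : "(unset)",
                   have_nvidia ? "loaded" : "not loaded");
        }
        return 0;
    }
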
44 changes: 33 additions & 11 deletions ompi/mca/pml/cm/pml_cm.h
@@ -6,6 +6,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -79,6 +80,7 @@ mca_pml_cm_irecv_init(void *addr,
struct ompi_request_t **request)
{
mca_pml_cm_hvy_recv_request_t *recvreq;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
#endif
@@ -87,7 +89,7 @@
if( OPAL_UNLIKELY(NULL == recvreq) ) return OMPI_ERR_OUT_OF_RESOURCE;

MCA_PML_CM_HVY_RECV_REQUEST_INIT(recvreq, ompi_proc, comm, tag, src,
datatype, addr, count, true);
datatype, addr, count, flags, true);

*request = (ompi_request_t*) recvreq;

@@ -104,6 +106,7 @@ mca_pml_cm_irecv(void *addr,
struct ompi_request_t **request)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc = NULL;
Expand All @@ -118,7 +121,8 @@ mca_pml_cm_irecv(void *addr,
src,
datatype,
addr,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_START(recvreq, comm, tag, src, ret);

@@ -145,6 +149,7 @@ mca_pml_cm_recv(void *addr,
ompi_status_public_t * status)
{
int ret;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t *ompi_proc;
#endif
@@ -173,20 +178,24 @@ mca_pml_cm_recv(void *addr,
ompi_proc = ompi_comm_peer_lookup( comm, src );
}

MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_recv(
ompi_proc->super.proc_convertor,
&(datatype->super),
count,
addr,
0,
flags,
&convertor );
#else
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_recv(
ompi_mpi_local_convertor,
&(datatype->super),
count,
addr,
0,
flags,
&convertor );
#endif

@@ -222,6 +231,7 @@ mca_pml_cm_isend_init(const void* buf,
ompi_request_t** request)
{
mca_pml_cm_hvy_send_request_t *sendreq;
uint32_t flags = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
#endif
@@ -230,7 +240,7 @@ mca_pml_cm_isend_init(const void* buf,
if (OPAL_UNLIKELY(NULL == sendreq)) return OMPI_ERR_OUT_OF_RESOURCE;

MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
datatype, sendmode, true, false, buf, count);
datatype, sendmode, true, false, buf, count, flags);

/* Work around a leak in start by marking this request as complete. The
* problem occurred because we do not have a way to differentiate an
@@ -254,6 +264,7 @@ mca_pml_cm_isend(const void* buf,
ompi_request_t** request)
{
int ret;
uint32_t flags = 0;

if(sendmode == MCA_PML_BASE_SEND_BUFFERED ) {
mca_pml_cm_hvy_send_request_t* sendreq;
@@ -274,7 +285,8 @@ mca_pml_cm_isend(const void* buf,
false,
false,
buf,
count);
count,
flags);

MCA_PML_CM_HVY_SEND_REQUEST_START( sendreq, ret);

@@ -296,7 +308,8 @@ mca_pml_cm_isend(const void* buf,
datatype,
sendmode,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_SEND_REQUEST_START(
sendreq,
@@ -324,6 +337,7 @@ mca_pml_cm_send(const void *buf,
ompi_communicator_t* comm)
{
int ret = OMPI_ERROR;
uint32_t flags = 0;
ompi_proc_t * ompi_proc;

if(sendmode == MCA_PML_BASE_SEND_BUFFERED) {
Expand All @@ -342,7 +356,8 @@ mca_pml_cm_send(const void *buf,
false,
false,
buf,
count);
count,
flags);
MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
MCA_PML_CM_HVY_SEND_REQUEST_RETURN(sendreq);
@@ -368,9 +383,12 @@ mca_pml_cm_send(const void *buf,
#endif
{
ompi_proc = ompi_comm_peer_lookup(comm, dst);

MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);

opal_convertor_copy_and_prepare_for_send(
ompi_proc->super.proc_convertor,
&datatype->super, count, buf, 0,
&datatype->super, count, buf, flags,
&convertor);
}

@@ -459,6 +477,7 @@ mca_pml_cm_imrecv(void *buf,
struct ompi_request_t **request)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
@@ -474,7 +493,8 @@
(*message)->peer,
datatype,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq, message, ret);

@@ -491,6 +511,7 @@ mca_pml_cm_mrecv(void *buf,
ompi_status_public_t* status)
{
int ret;
uint32_t flags = 0;
mca_pml_cm_thin_recv_request_t *recvreq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
ompi_proc_t* ompi_proc;
@@ -506,7 +527,8 @@
(*message)->peer,
datatype,
buf,
count);
count,
flags);

MCA_PML_CM_THIN_RECV_REQUEST_MATCHED_START(recvreq,
message, ret);
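
Throughout this file the new flags word is threaded from each send and receive path into opal_convertor_copy_and_prepare_for_send()/..._for_recv(), and MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF is invoked first to populate it. The macro itself is defined elsewhere in this PR and is not shown in this excerpt; the sketch below is only an illustration of the intended behavior, with EXAMPLE_* names invented for it: when the selected MTL advertises MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE, the convertor is asked to leave CUDA device buffers alone so the MTL (here PSM2) receives the original GPU pointer.

    /* Hypothetical illustration only; the real macro added by this PR may differ.
     * EXAMPLE_CONVERTOR_SKIP_CUDA stands in for whatever convertor flag the real
     * macro uses; ompi_mtl is the globally selected MTL module. */
    #if OPAL_CUDA_SUPPORT
    #define EXAMPLE_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)          \
        do {                                                                    \
            if (ompi_mtl->mtl_flags & MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE) {    \
                (flags) |= EXAMPLE_CONVERTOR_SKIP_CUDA;                         \
            }                                                                   \
        } while (0)
    #else
    #define EXAMPLE_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count)
    #endif
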