1 change: 0 additions & 1 deletion opal/mca/btl/smcuda/btl_smcuda_component.c
@@ -849,7 +849,6 @@ mca_btl_smcuda_component_init(int *num_btls,
mca_common_cuda_stage_one_init();
mca_common_cuda_bind();
#endif /* OPAL_CUDA_SUPPORT */

/* if no session directory was created, then we cannot be used */
if (NULL == opal_process_info.job_session_dir) {
/* SKG - this isn't true anymore. Some backing facilities don't require a
37 changes: 29 additions & 8 deletions opal/mca/common/cuda/common_cuda.c
@@ -42,6 +42,7 @@
#include "opal/runtime/opal_params.h"
#include "opal/mca/timer/base/base.h"
#include "opal/mca/dl/base/base.h"
#include "opal/mca/hwloc/base/base.h"

#include "common_cuda.h"

@@ -473,26 +474,46 @@ int mca_common_cuda_stage_one_init(void)
#if OPAL_CUDA_GET_ATTRIBUTES
OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
OPAL_CUDA_DLSYM(libcuda_handle, cudaDeviceGetByPCIBusId);
OPAL_CUDA_DLSYM(libcuda_handle, cudaSetDevice);
//OPAL_CUDA_DLSYM(libcuda_handle, cudaDeviceGetByPCIBusId);
//OPAL_CUDA_DLSYM(libcuda_handle, cudaSetDevice);


return 0;
}

void mca_common_cuda_bind()
{
char *mca_name, *mca_val, *PciBusId;
int dev, numaid, gpuid;
{
/* debug hold: spin here until a debugger attaches and clears 'delay' */
int delay = 1;
while(delay){
sleep(1);
}
}
char *mca_name, *mca_val;
char pciBusId[16];
int dev, obj_type, obj_idx, gpuid;
hwloc_obj_t bind_gpu;
hwloc_obj_t obj;
(void) mca_base_var_env_name ("rmaps_gpu_no", &mca_name);
mca_val = getenv(mca_name);
/* parse mca_val into the object type, object index and gpu id */
sscanf(mca_val , "%d:%d", &numaid, &gpuid);
bind_gpu = opal_hwloc_base_gpu_pci_ids(numaid,gpuid);
PciBusId = bind_gpu->attr->pcidev.bus;
cudaDeviceGetByPCIBusId(&dev, PciBusId);
sscanf(mca_val , "%d:%d:%d", &obj_type, &obj_idx, &gpuid);

if( obj_type == 0 ){
obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_MACHINE, 0,
0, OPAL_HWLOC_AVAILABLE);
} else {
obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
obj_idx, OPAL_HWLOC_AVAILABLE);
}

bind_gpu = opal_hwloc_get_gpu_by_idx(gpuid, obj);

sprintf(pciBusId, "%.2x:%.2x:%.2x.%x", bind_gpu->attr->pcidev.domain, bind_gpu->attr->pcidev.bus,
bind_gpu->attr->pcidev.dev, bind_gpu->attr->pcidev.func);
cudaDeviceGetByPCIBusId(&dev, pciBusId);
cudaSetDevice(dev);
}

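Note (not part of the patch): a minimal standalone sketch of the selection flow that the patched mca_common_cuda_bind() implements, assuming the mapper exports rmaps_gpu_no as obj_type:obj_idx:gpuid and that mca_base_var_env_name() resolves it to the usual OMPI_MCA_ prefix; the PCI bus id is a placeholder here because the hwloc lookup is omitted.

/* Sketch only: parse the rmaps_gpu_no value and bind to the matching
 * CUDA device by PCI bus id, as mca_common_cuda_bind() does. The bus id
 * below is a placeholder; the real code builds it from the hwloc PCI
 * attributes of the selected GPU object. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
    int obj_type, obj_idx, gpuid, dev;
    const char *val = getenv("OMPI_MCA_rmaps_gpu_no");  /* assumed env name */

    if (NULL == val || 3 != sscanf(val, "%d:%d:%d", &obj_type, &obj_idx, &gpuid)) {
        fprintf(stderr, "rmaps_gpu_no not set or malformed\n");
        return 1;
    }

    const char *pciBusId = "0000:03:00.0";              /* placeholder bus id */

    if (cudaSuccess != cudaDeviceGetByPCIBusId(&dev, pciBusId) ||
        cudaSuccess != cudaSetDevice(dev)) {
        fprintf(stderr, "could not bind to GPU %d under object %d:%d\n",
                gpuid, obj_type, obj_idx);
        return 1;
    }
    return 0;
}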
4 changes: 2 additions & 2 deletions opal/mca/hwloc/base/base.h
@@ -97,9 +97,9 @@ OPAL_DECLSPEC int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *poli
*/
OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);

OPAL_DECLSPEC int test_find_gpu(hwloc_obj_t obj);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx, hwloc_obj_t obj);
OPAL_DECLSPEC int opal_hwloc_prefind_gpu(hwloc_obj_t obj);

struct opal_rmaps_numa_node_t {
opal_list_item_t super;
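Note (not part of the patch): a hedged usage sketch of the two revised prototypes, assuming opal_hwloc_topology has already been loaded by the hwloc base framework; it mirrors the caller added in common_cuda.c.

/* Sketch only: count the NVIDIA GPUs below a NUMA node with
 * opal_hwloc_prefind_gpu() and fetch one by 0-based index with
 * opal_hwloc_get_gpu_by_idx(). */
#include "opal/mca/hwloc/base/base.h"

static hwloc_obj_t pick_gpu(int numa_idx, int gpu_idx)
{
    hwloc_obj_t numa = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                       HWLOC_OBJ_NODE, 0,
                                                       numa_idx,
                                                       OPAL_HWLOC_AVAILABLE);
    if (NULL == numa || 0 == opal_hwloc_prefind_gpu(numa)) {
        return NULL;   /* no such NUMA node, or no GPU below it */
    }
    return opal_hwloc_get_gpu_by_idx(gpu_idx, numa);
}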
86 changes: 53 additions & 33 deletions opal/mca/hwloc/base/hwloc_base_util.c
@@ -105,53 +105,73 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
return obj;
}

static int gpuIndex = 0;
static hwloc_obj_t gpus[16] = {0};

int test_find_gpu(hwloc_obj_t obj){
int opal_hwloc_prefind_gpu(hwloc_obj_t obj)
{
unsigned int gpu_cnt = 0;
hwloc_obj_t child;
//int result = 0;

if(obj->attr->pcidev.vendor_id == 0x10de)
gpus[gpuIndex++] = obj;
gpu_cnt++;
child = obj->first_child;
while(child){
test_find_gpu(child);
gpu_cnt += opal_hwloc_prefind_gpu(child);
child = child->next_sibling;
}

int ret_value = gpuIndex;
//gpuIndex = 0;
return ret_value;
return gpu_cnt;
}

hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx){

return gpus[idx];
static hwloc_obj_t get_gpu_cnt(int idx, hwloc_obj_t obj, int *cidx){
int gpu_idx = *cidx;
hwloc_obj_t child;
if(obj->attr->pcidev.vendor_id == 0x10de)
gpu_idx++;
*cidx = gpu_idx;
if( gpu_idx == idx ){
goto exit;
}
child = obj->first_child;
while(child){
obj = get_gpu_cnt(idx, child, cidx);
if( idx == *cidx ){
goto exit;
}
child = child->next_sibling;
}
exit:
return obj;
}



hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno)
{
int *ret = NULL;
hwloc_obj_t machine;
/* Similar to http://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-support
* 1. Get NUMA by index
* 2. Find BRIDGE
* 3. Get gpudevice's hwloc_obj_t structure into gpu
*/
/*machine = hwloc_get_root_obj(opal_hwloc_topology);
if(machine->arity < numa){
//test_find_gpu(machine->children[numa]);
}
else return NULL;
*/
if (gpuIndex>devno)
return gpus[devno];
else return NULL;
hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx, hwloc_obj_t obj){
int cidx = -1;
hwloc_obj_t nobj = get_gpu_cnt(idx, obj, &cidx);
if( cidx != idx ){
// TODO_NV: output in verbose case
return NULL;
}
return nobj;
}

//hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno)
//{
// int *ret = NULL;
// hwloc_obj_t machine;
// /* Similar to http://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-support
// * 1. Get NUMA by index
// * 2. Find BRIDGE
// * 3. Get gpudevice's hwloc_obj_t structure into gpu
// */
// /*machine = hwloc_get_root_obj(opal_hwloc_topology);
// if(machine->arity < numa){
// //test_find_gpu(machine->children[numa]);
// }
// else return NULL;
// */
// if (gpuIndex>devno)
// return gpus[devno];
// else return NULL;
//}

/* determine the node-level available cpuset based on
* online vs allowed vs user-specified cpus
*/
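Note (not part of the patch): the counting done by opal_hwloc_prefind_gpu() can be reproduced with stock hwloc, as sketched below. Unlike the version above, the sketch also checks obj->type before reading the pcidev attribute, since attr is a union; with hwloc 1.x the PCI objects appear in the regular children list when I/O discovery is enabled, which is what the recursive walk relies on.

/* Sketch only: recursively count NVIDIA PCI devices under an hwloc
 * object using plain hwloc. Requires a topology built with PCI/I/O
 * discovery enabled. */
#include <hwloc.h>

#define NVIDIA_VENDOR_ID 0x10de

static unsigned count_nvidia_gpus(hwloc_obj_t obj)
{
    unsigned cnt = 0;
    hwloc_obj_t child;

    /* attr is a union: only read pcidev fields on PCI device objects */
    if (HWLOC_OBJ_PCI_DEVICE == obj->type &&
        NVIDIA_VENDOR_ID == obj->attr->pcidev.vendor_id) {
        cnt++;
    }
    for (child = obj->first_child; NULL != child; child = child->next_sibling) {
        cnt += count_nvidia_gpus(child);
    }
    return cnt;
}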
12 changes: 7 additions & 5 deletions orte/mca/rmaps/base/rmaps_base_binding.c
@@ -741,6 +741,9 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
case OPAL_BIND_TO_HWTHREAD:
hwb = HWLOC_OBJ_PU;
break;
case OPAL_BIND_TO_GPU:
/* TODO: is mapping GPU binding onto the NUMA-node level the right choice here? */
hwb = HWLOC_OBJ_NODE;
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
@@ -802,11 +805,10 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
* procs to the resources below.
*/

if (ORTE_MAPPING_BYDIST == map
#if (CUDA | OPEN_ACC)
|| ORTE_MAPPING_BYGPU == map
#endif
) {
if (ORTE_MAPPING_BYDIST == map || ORTE_MAPPING_BYGPU == map)
//#if (HAVE_CUDA )
//#endif
{
int rc = ORTE_SUCCESS;
if (OPAL_BIND_TO_NUMA == bind) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
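Note (not part of the patch): a condensed sketch of how the new OPAL_BIND_TO_GPU policy folds into the policy-to-hwloc-level switch in orte_rmaps_base_compute_bindings(). A GPU is not an hwloc container of PUs, so binding "to a GPU" is approximated by binding to the NUMA node the GPU hangs off; the helper name below is hypothetical.

/* Sketch only (hypothetical helper): map a binding policy to the hwloc
 * level that the binding loop iterates over. Note the explicit break on
 * the GPU case so it does not fall through to the error path. */
#include "opal/mca/hwloc/base/base.h"   /* opal_binding_policy_t, OPAL_BIND_TO_* */
#include "orte/constants.h"             /* ORTE_SUCCESS, ORTE_ERR_BAD_PARAM */

static int level_for_policy(opal_binding_policy_t bind, hwloc_obj_type_t *hwb)
{
    switch (bind) {
    case OPAL_BIND_TO_HWTHREAD:
        *hwb = HWLOC_OBJ_PU;
        break;
    case OPAL_BIND_TO_CORE:
        *hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_NUMA:
    case OPAL_BIND_TO_GPU:      /* GPU binding reuses the NUMA-node level */
        *hwb = HWLOC_OBJ_NODE;
        break;
    default:
        return ORTE_ERR_BAD_PARAM;
    }
    return ORTE_SUCCESS;
}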
95 changes: 50 additions & 45 deletions orte/mca/rmaps/ppr/rmaps_ppr.c
@@ -22,7 +22,6 @@
#include <string.h>
#endif /* HAVE_STRING_H */


#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"

Expand Down Expand Up @@ -95,12 +94,12 @@ static int ppr_mapper(orte_job_t *jdata)
bool initial_map=true;
bool gpu_mapping = false;

{
int delay = 1;
while( delay ){
sleep(1);
}
}
// {
// int delay = 1;
// while( delay ){
// sleep(1);
// }
// }


/* only handle initial launch of loadbalanced
@@ -315,6 +314,49 @@
/* if we are mapping solely at the node level, just put
* that many procs on this node
*/
#if (HAVE_CUDA)
if( gpu_mapping ){
hwloc_obj_type_t cur_level = lowest;
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
cur_level, cache_level,
OPAL_HWLOC_AVAILABLE);

if( nobjs == 0){
cur_level = opal_hwloc_levels[OPAL_HWLOC_NODE_LEVEL];
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
cur_level, cache_level,
OPAL_HWLOC_AVAILABLE);
}
/* map the specified number of procs to each such resource on this node,
* recording the locale of each proc so we know its cpuset
*/
for (i=0; i < nobjs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology,
cur_level, cache_level,
i, OPAL_HWLOC_AVAILABLE);
int k;
int gpu_cnt;
gpu_cnt = opal_hwloc_prefind_gpu(obj);

if( 0 == gpu_cnt ){
// skip this numa node (if gpu_mapping => start = NUMA_LEVEL)
continue;
}

for( k=0; k < gpu_cnt; k++){
for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
nprocs_mapped++;
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
orte_set_attribute(&proc->attributes, ORTE_PROC_GPU_ID, ORTE_ATTR_LOCAL, (void*)&k, OPAL_INT);
}
}
}
} else
#endif
if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
obj = hwloc_get_root_obj(node->topology);
@@ -336,50 +378,13 @@
lowest, cache_level,
OPAL_HWLOC_AVAILABLE);

/* map the specified number of procs to each such resource on this node,
/* map the specified number of procs to each such resource on this node,
* recording the locale of each proc so we know its cpuset
*/
for (i=0; i < nobjs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology,
lowest, cache_level,
i, OPAL_HWLOC_AVAILABLE);
#if (HAVE_CUDA)
if( gpu_mapping ){
gpu_mapping = false;
int k;

//gpuno = discover_gpu(node->topology,obj);
int gpuno;
gpuno=test_find_gpu(obj);

if( 0 == gpuno ){
// skip this numa node (if gpu_mapping => start = NUMA_LEVEL)
continue;
}

// We want GPU# * ppr processes per NUMA.
//int proc_num = gpuno * ppr[start];
// TODO: Check that this numa can handle this number of processes
// It depends on what is considered as processing element.
/*if( check(obj, proc_num, pe_type) ){
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}*/

for( k=0; k < gpuno; k++){
for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
nprocs_mapped++;
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
orte_set_attribute(&proc->attributes, ORTE_PROC_GPU_ID, ORTE_ATTR_LOCAL, k, OPAL_INT);
}
}
continue;
}
#endif

for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
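Note (not part of the patch): the new block in ppr_mapper() places ppr[start] processes per GPU on each NUMA node and skips NUMA nodes without GPUs. A small standalone sketch of that distribution arithmetic, detached from the ORTE data structures (the GPU counts are illustrative):

/* Sketch only: GPU-aware ppr placement. For each NUMA object with at
 * least one GPU, map ppr procs per GPU until total_procs are placed;
 * NUMA nodes without GPUs are skipped, as in the hunk above. */
#include <stdio.h>

int main(void)
{
    int gpus_per_numa[] = {2, 0, 1, 1};    /* illustrative topology */
    int nnuma = 4, ppr = 2, total_procs = 7, mapped = 0;

    for (int i = 0; i < nnuma && mapped < total_procs; i++) {
        if (0 == gpus_per_numa[i]) {
            continue;                      /* skip GPU-less NUMA node */
        }
        for (int k = 0; k < gpus_per_numa[i]; k++) {
            for (int j = 0; j < ppr && mapped < total_procs; j++) {
                printf("proc %d -> numa %d, gpu %d\n", mapped, i, k);
                mapped++;
            }
        }
    }
    return 0;
}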