1 change: 0 additions & 1 deletion opal/mca/btl/smcuda/btl_smcuda_component.c
@@ -849,7 +849,6 @@ mca_btl_smcuda_component_init(int *num_btls,
mca_common_cuda_stage_one_init();
mca_common_cuda_bind();
#endif /* OPAL_CUDA_SUPPORT */

/* if no session directory was created, then we cannot be used */
if (NULL == opal_process_info.job_session_dir) {
/* SKG - this isn't true anymore. Some backing facilities don't require a
37 changes: 29 additions & 8 deletions opal/mca/common/cuda/common_cuda.c
@@ -42,6 +42,7 @@
#include "opal/runtime/opal_params.h"
#include "opal/mca/timer/base/base.h"
#include "opal/mca/dl/base/base.h"
#include "opal/mca/hwloc/base/base.h"

#include "common_cuda.h"

@@ -473,26 +474,46 @@ int mca_common_cuda_stage_one_init(void)
#if OPAL_CUDA_GET_ATTRIBUTES
OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes);
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
OPAL_CUDA_DLSYM(libcuda_handle, cudaDeviceGetByPCIBusId);
OPAL_CUDA_DLSYM(libcuda_handle, cudaSetDevice);
//OPAL_CUDA_DLSYM(libcuda_handle, cudaDeviceGetByPCIBusId);
//OPAL_CUDA_DLSYM(libcuda_handle, cudaSetDevice);


return 0;
}

void mca_common_cuda_bind()
{
char *mca_name, *mca_val, *PciBusId;
int dev, numaid, gpuid;
{
/* debug hold: spin here until a debugger attaches and clears 'delay' */
int delay = 1;
while(delay){
sleep(1);
}
}
char *mca_name, *mca_val;
char pciBusId[16];
int dev, obj_type, obj_idx, gpuid;
hwloc_obj_t bind_gpu;
hwloc_obj_t obj;
(void) mca_base_var_env_name ("rmaps_gpu_no", &mca_name);
mca_val = getenv(mca_name);
/* parse mca_val into the object type, object index and gpu id */
sscanf(mca_val , "%d:%d", &numaid, &gpuid);
bind_gpu = opal_hwloc_base_gpu_pci_ids(numaid,gpuid);
PciBusId = bind_gpu->attr->pcidev.bus;
cudaDeviceGetByPCIBusId(&dev, PciBusId);
sscanf(mca_val , "%d:%d:%d", &obj_type, &obj_idx, &gpuid);

if( obj_type == 0 ){
obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_MACHINE, 0,
0, OPAL_HWLOC_AVAILABLE);
} else {
obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
obj_idx, OPAL_HWLOC_AVAILABLE);
}

bind_gpu = opal_hwloc_get_gpu_by_idx(gpuid, obj);

sprintf(pciBusId, "%.2x:%.2x:%.2x.%x", bind_gpu->attr->pcidev.domain, bind_gpu->attr->pcidev.bus,
bind_gpu->attr->pcidev.dev, bind_gpu->attr->pcidev.func);
cudaDeviceGetByPCIBusId(&dev, pciBusId);
cudaSetDevice(dev);
}

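Note (not part of the patch): a minimal standalone sketch of the selection flow that the patched mca_common_cuda_bind() implements, assuming the mapper exports rmaps_gpu_no as obj_type:obj_idx:gpuid and that mca_base_var_env_name() resolves it to the usual OMPI_MCA_ prefix; the PCI bus id is a placeholder here because the hwloc lookup is omitted.

/* Sketch only: parse the rmaps_gpu_no value and bind to the matching
 * CUDA device by PCI bus id, as mca_common_cuda_bind() does. The bus id
 * below is a placeholder; the real code builds it from the hwloc PCI
 * attributes of the selected GPU object. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
    int obj_type, obj_idx, gpuid, dev;
    const char *val = getenv("OMPI_MCA_rmaps_gpu_no");  /* assumed env name */

    if (NULL == val || 3 != sscanf(val, "%d:%d:%d", &obj_type, &obj_idx, &gpuid)) {
        fprintf(stderr, "rmaps_gpu_no not set or malformed\n");
        return 1;
    }

    const char *pciBusId = "0000:03:00.0";              /* placeholder bus id */

    if (cudaSuccess != cudaDeviceGetByPCIBusId(&dev, pciBusId) ||
        cudaSuccess != cudaSetDevice(dev)) {
        fprintf(stderr, "could not bind to GPU %d under object %d:%d\n",
                gpuid, obj_type, obj_idx);
        return 1;
    }
    return 0;
}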
4 changes: 2 additions & 2 deletions opal/mca/hwloc/base/base.h
@@ -97,9 +97,9 @@ OPAL_DECLSPEC int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *poli
*/
OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);

OPAL_DECLSPEC int test_find_gpu(hwloc_obj_t obj);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx);
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx, hwloc_obj_t obj);
OPAL_DECLSPEC int opal_hwloc_prefind_gpu(hwloc_obj_t obj);

struct opal_rmaps_numa_node_t {
opal_list_item_t super;
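Note (not part of the patch): a hedged usage sketch of the two revised prototypes, assuming opal_hwloc_topology has already been loaded by the hwloc base framework; it mirrors the caller added in common_cuda.c.

/* Sketch only: count the NVIDIA GPUs below a NUMA node with
 * opal_hwloc_prefind_gpu() and fetch one by 0-based index with
 * opal_hwloc_get_gpu_by_idx(). */
#include "opal/mca/hwloc/base/base.h"

static hwloc_obj_t pick_gpu(int numa_idx, int gpu_idx)
{
    hwloc_obj_t numa = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                       HWLOC_OBJ_NODE, 0,
                                                       numa_idx,
                                                       OPAL_HWLOC_AVAILABLE);
    if (NULL == numa || 0 == opal_hwloc_prefind_gpu(numa)) {
        return NULL;   /* no such NUMA node, or no GPU below it */
    }
    return opal_hwloc_get_gpu_by_idx(gpu_idx, numa);
}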
86 changes: 53 additions & 33 deletions opal/mca/hwloc/base/hwloc_base_util.c
@@ -105,53 +105,73 @@ hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
return obj;
}

static int gpuIndex = 0;
static hwloc_obj_t gpus[16] = {0};

int test_find_gpu(hwloc_obj_t obj){
int opal_hwloc_prefind_gpu(hwloc_obj_t obj)
{
unsigned int gpu_cnt = 0;
hwloc_obj_t child;
//int result = 0;

if(obj->attr->pcidev.vendor_id == 0x10de)
gpus[gpuIndex++] = obj;
gpu_cnt++;
child = obj->first_child;
while(child){
test_find_gpu(child);
gpu_cnt += opal_hwloc_prefind_gpu(child);
child = child->next_sibling;
}

int ret_value = gpuIndex;
//gpuIndex = 0;
return ret_value;
return gpu_cnt;
}

hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx){

return gpus[idx];
static hwloc_obj_t get_gpu_cnt(int idx, hwloc_obj_t obj, int *cidx){
int gpu_idx = *cidx;
hwloc_obj_t child;
if(obj->attr->pcidev.vendor_id == 0x10de)
gpu_idx++;
*cidx = gpu_idx;
if( gpu_idx == idx ){
goto exit;
}
child = obj->first_child;
while(child){
obj = get_gpu_cnt(idx, child, cidx);
if( idx == *cidx ){
goto exit;
}
child = child->next_sibling;
}
exit:
return obj;
}



hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno)
{
int *ret = NULL;
hwloc_obj_t machine;
/* Similar to http://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-support
* 1. Get NUMA by index
* 2. Find BRIDGE
* 3. Get gpudevice's hwloc_obj_t structure into gpu
*/
/*machine = hwloc_get_root_obj(opal_hwloc_topology);
if(machine->arity < numa){
//test_find_gpu(machine->children[numa]);
}
else return NULL;
*/
if (gpuIndex>devno)
return gpus[devno];
else return NULL;
hwloc_obj_t opal_hwloc_get_gpu_by_idx(int idx, hwloc_obj_t obj){
int cidx = -1;
hwloc_obj_t nobj = get_gpu_cnt(idx, obj, &cidx);
if( cidx != idx ){
// TODO_NV: output in verbose case
return NULL;
}
return nobj;
}

//hwloc_obj_t opal_hwloc_base_gpu_pci_ids(int numa, int devno)
//{
// int *ret = NULL;
// hwloc_obj_t machine;
// /* Similar to http://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-support
// * 1. Get NUMA by index
// * 2. Find BRIDGE
// * 3. Get gpudevice's hwloc_obj_t structure into gpu
// */
// /*machine = hwloc_get_root_obj(opal_hwloc_topology);
// if(machine->arity < numa){
// //test_find_gpu(machine->children[numa]);
// }
// else return NULL;
// */
// if (gpuIndex>devno)
// return gpus[devno];
// else return NULL;
//}

/* determine the node-level available cpuset based on
* online vs allowed vs user-specified cpus
*/
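Note (not part of the patch): the counting done by opal_hwloc_prefind_gpu() can be reproduced with stock hwloc, as sketched below. Unlike the version above, the sketch also checks obj->type before reading the pcidev attribute, since attr is a union; with hwloc 1.x the PCI objects appear in the regular children list when I/O discovery is enabled, which is what the recursive walk relies on.

/* Sketch only: recursively count NVIDIA PCI devices under an hwloc
 * object using plain hwloc. Requires a topology built with PCI/I/O
 * discovery enabled. */
#include <hwloc.h>

#define NVIDIA_VENDOR_ID 0x10de

static unsigned count_nvidia_gpus(hwloc_obj_t obj)
{
    unsigned cnt = 0;
    hwloc_obj_t child;

    /* attr is a union: only read pcidev fields on PCI device objects */
    if (HWLOC_OBJ_PCI_DEVICE == obj->type &&
        NVIDIA_VENDOR_ID == obj->attr->pcidev.vendor_id) {
        cnt++;
    }
    for (child = obj->first_child; NULL != child; child = child->next_sibling) {
        cnt += count_nvidia_gpus(child);
    }
    return cnt;
}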
12 changes: 7 additions & 5 deletions orte/mca/rmaps/base/rmaps_base_binding.c
@@ -741,6 +741,9 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
case OPAL_BIND_TO_HWTHREAD:
hwb = HWLOC_OBJ_PU;
break;
case OPAL_BIND_TO_GPU:
/* TODO: is mapping GPU binding onto the NUMA-node level the right choice here? */
hwb = HWLOC_OBJ_NODE;
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
@@ -802,11 +805,10 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
* procs to the resources below.
*/

if (ORTE_MAPPING_BYDIST == map
#if (CUDA | OPEN_ACC)
|| ORTE_MAPPING_BYGPU == map
#endif
) {
if (ORTE_MAPPING_BYDIST == map || ORTE_MAPPING_BYGPU == map)
//#if (HAVE_CUDA )
//#endif
{
int rc = ORTE_SUCCESS;
if (OPAL_BIND_TO_NUMA == bind) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
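Note (not part of the patch): a condensed sketch of how the new OPAL_BIND_TO_GPU policy folds into the policy-to-hwloc-level switch in orte_rmaps_base_compute_bindings(). A GPU is not an hwloc container of PUs, so binding "to a GPU" is approximated by binding to the NUMA node the GPU hangs off; the helper name below is hypothetical.

/* Sketch only (hypothetical helper): map a binding policy to the hwloc
 * level that the binding loop iterates over. Note the explicit break on
 * the GPU case so it does not fall through to the error path. */
#include "opal/mca/hwloc/base/base.h"   /* opal_binding_policy_t, OPAL_BIND_TO_* */
#include "orte/constants.h"             /* ORTE_SUCCESS, ORTE_ERR_BAD_PARAM */

static int level_for_policy(opal_binding_policy_t bind, hwloc_obj_type_t *hwb)
{
    switch (bind) {
    case OPAL_BIND_TO_HWTHREAD:
        *hwb = HWLOC_OBJ_PU;
        break;
    case OPAL_BIND_TO_CORE:
        *hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_NUMA:
    case OPAL_BIND_TO_GPU:      /* GPU binding reuses the NUMA-node level */
        *hwb = HWLOC_OBJ_NODE;
        break;
    default:
        return ORTE_ERR_BAD_PARAM;
    }
    return ORTE_SUCCESS;
}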
95 changes: 50 additions & 45 deletions orte/mca/rmaps/ppr/rmaps_ppr.c
@@ -22,7 +22,6 @@
#include <string.h>
#endif /* HAVE_STRING_H */


#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"

Expand Down Expand Up @@ -95,12 +94,12 @@ static int ppr_mapper(orte_job_t *jdata)
bool initial_map=true;
bool gpu_mapping = false;

{
int delay = 1;
while( delay ){
sleep(1);
}
}
// {
// int delay = 1;
// while( delay ){
// sleep(1);
// }
// }


/* only handle initial launch of loadbalanced
@@ -315,6 +314,49 @@
/* if we are mapping solely at the node level, just put
* that many procs on this node
*/
#if (HAVE_CUDA)
if( gpu_mapping ){
hwloc_obj_type_t cur_level = lowest;
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
cur_level, cache_level,
OPAL_HWLOC_AVAILABLE);

if( nobjs == 0){
cur_level = opal_hwloc_levels[OPAL_HWLOC_NODE_LEVEL];
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
cur_level, cache_level,
OPAL_HWLOC_AVAILABLE);
}
/* map the specified number of procs to each such resource on this node,
* recording the locale of each proc so we know its cpuset
*/
for (i=0; i < nobjs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology,
cur_level, cache_level,
i, OPAL_HWLOC_AVAILABLE);
int k;
int gpu_cnt;
gpu_cnt = opal_hwloc_prefind_gpu(obj);

if( 0 == gpu_cnt ){
// skip this numa node (if gpu_mapping => start = NUMA_LEVEL)
continue;
}

for( k=0; k < gpu_cnt; k++){
for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
nprocs_mapped++;
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
orte_set_attribute(&proc->attributes, ORTE_PROC_GPU_ID, ORTE_ATTR_LOCAL, (void*)&k, OPAL_INT);
}
}
}
} else
#endif
if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
obj = hwloc_get_root_obj(node->topology);
@@ -336,50 +378,13 @@
lowest, cache_level,
OPAL_HWLOC_AVAILABLE);

/* map the specified number of procs to each such resource on this node,
/* map the specified number of procs to each such resource on this node,
* recording the locale of each proc so we know its cpuset
*/
for (i=0; i < nobjs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology,
lowest, cache_level,
i, OPAL_HWLOC_AVAILABLE);
#if (HAVE_CUDA)
if( gpu_mapping ){
gpu_mapping = false;
int k;

//gpuno = discover_gpu(node->topology,obj);
int gpuno;
gpuno=test_find_gpu(obj);

if( 0 == gpuno ){
// skip this numa node (if gpu_mapping => start = NUMA_LEVEL)
continue;
}

// We want GPU# * ppr processes per NUMA.
//int proc_num = gpuno * ppr[start];
// TODO: Check that this numa can handle this number of processes
// It depends on what is considered as processing element.
/*if( check(obj, proc_num, pe_type) ){
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}*/

for( k=0; k < gpuno; k++){
for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto error;
}
nprocs_mapped++;
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
orte_set_attribute(&proc->attributes, ORTE_PROC_GPU_ID, ORTE_ATTR_LOCAL, k, OPAL_INT);
}
}
continue;
}
#endif

for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
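Note (not part of the patch): the new block in ppr_mapper() places ppr[start] processes per GPU on each NUMA node and skips NUMA nodes without GPUs. A small standalone sketch of that distribution arithmetic, detached from the ORTE data structures (the GPU counts are illustrative):

/* Sketch only: GPU-aware ppr placement. For each NUMA object with at
 * least one GPU, map ppr procs per GPU until total_procs are placed;
 * NUMA nodes without GPUs are skipped, as in the hunk above. */
#include <stdio.h>

int main(void)
{
    int gpus_per_numa[] = {2, 0, 1, 1};    /* illustrative topology */
    int nnuma = 4, ppr = 2, total_procs = 7, mapped = 0;

    for (int i = 0; i < nnuma && mapped < total_procs; i++) {
        if (0 == gpus_per_numa[i]) {
            continue;                      /* skip GPU-less NUMA node */
        }
        for (int k = 0; k < gpus_per_numa[i]; k++) {
            for (int j = 0; j < ppr && mapped < total_procs; j++) {
                printf("proc %d -> numa %d, gpu %d\n", mapped, i, k);
                mapped++;
            }
        }
    }
    return 0;
}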