@@ -210,7 +210,6 @@ static int adjust_cq(mca_btl_openib_device_t *device, const int cq)
210
210
return OPAL_ERROR ;
211
211
}
212
212
213
- OPAL_THREAD_LOCK (& device -> device_lock );
214
213
if (!device -> progress ) {
215
214
int rc ;
216
215
device -> progress = true;
@@ -219,7 +218,6 @@ static int adjust_cq(mca_btl_openib_device_t *device, const int cq)
219
218
return rc ;
220
219
}
221
220
}
222
- OPAL_THREAD_UNLOCK (& device -> device_lock );
223
221
#endif
224
222
}
225
223
#ifdef HAVE_IBV_RESIZE_CQ
@@ -406,7 +404,7 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
406
404
return OPAL_SUCCESS ;
407
405
}
408
406
409
- static int mca_btl_openib_size_queues (struct mca_btl_openib_module_t * openib_btl , size_t nprocs )
407
+ static int mca_btl_openib_size_queues_nolock (struct mca_btl_openib_module_t * openib_btl , size_t nprocs )
410
408
{
411
409
uint32_t send_cqes , recv_cqes ;
412
410
int rc = OPAL_SUCCESS , qp ;
@@ -603,7 +601,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
603
601
return OPAL_SUCCESS ;
604
602
}
605
603
606
- static int prepare_device_for_use (mca_btl_openib_device_t * device )
604
+ static int prepare_device_for_use_nolock (mca_btl_openib_device_t * device )
607
605
{
608
606
mca_btl_openib_frag_init_data_t * init_data ;
609
607
int rc , length ;
@@ -920,7 +918,12 @@ static int init_ib_proc_nolock(mca_btl_openib_module_t* openib_btl, mca_btl_open
920
918
return OPAL_ERROR ;
921
919
}
922
920
921
+ /* protect device because several endpoints for different ib_proc's
922
+ * may be simultaneously initialized */
923
+ opal_mutex_lock (& openib_btl -> device -> device_lock );
923
924
endpoint -> index = opal_pointer_array_add (openib_btl -> device -> endpoints , (void * )endpoint );
925
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
926
+
924
927
if ( 0 > endpoint -> index ) {
925
928
OBJ_RELEASE (endpoint );
926
929
return OPAL_ERROR ;
@@ -981,21 +984,21 @@ int mca_btl_openib_add_procs(
981
984
#endif
982
985
983
986
/* protect the device */
984
- opal_mutex_lock (& mca_btl_openib_component . ib_lock );
985
- rc = prepare_device_for_use (openib_btl -> device );
987
+ opal_mutex_lock (& openib_btl -> device -> device_lock );
988
+ rc = prepare_device_for_use_nolock (openib_btl -> device );
986
989
if (OPAL_SUCCESS != rc ) {
987
990
BTL_ERROR (("could not prepare openib device for use" ));
988
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
991
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
989
992
return rc ;
990
993
}
991
994
992
- rc = mca_btl_openib_size_queues (openib_btl , nprocs );
995
+ rc = mca_btl_openib_size_queues_nolock (openib_btl , nprocs );
993
996
if (OPAL_SUCCESS != rc ) {
994
997
BTL_ERROR (("error creating cqs" ));
995
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
998
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
996
999
return rc ;
997
1000
}
998
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
1001
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
999
1002
1000
1003
for (i = 0 , local_procs = 0 ; i < (int ) nprocs ; i ++ ) {
1001
1004
struct opal_proc_t * proc = procs [i ];
@@ -1075,21 +1078,21 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
1075
1078
1076
1079
// TODO: shift to the separate function
1077
1080
/* protect the device */
1078
- opal_mutex_lock (& mca_btl_openib_component . ib_lock );
1079
- rc = prepare_device_for_use (openib_btl -> device );
1081
+ opal_mutex_lock (& openib_btl -> device -> device_lock );
1082
+ rc = prepare_device_for_use_nolock (openib_btl -> device );
1080
1083
if (OPAL_SUCCESS != rc ) {
1081
1084
BTL_ERROR (("could not prepare openib device for use" ));
1082
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
1085
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
1083
1086
return NULL ;
1084
1087
}
1085
1088
1086
- rc = mca_btl_openib_size_queues (openib_btl , 1 );
1089
+ rc = mca_btl_openib_size_queues_nolock (openib_btl , 1 );
1087
1090
if (OPAL_SUCCESS != rc ) {
1088
1091
BTL_ERROR (("error creating cqs" ));
1089
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
1092
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
1090
1093
return NULL ;
1091
1094
}
1092
- opal_mutex_unlock (& mca_btl_openib_component . ib_lock );
1095
+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
1093
1096
1094
1097
1095
1098
if (NULL == (ib_proc = mca_btl_openib_proc_get_locked (proc , & is_new ))) {
0 commit comments