Skip to content

Commit 7e18deb

Browse files
committed
Fix singleton tmp files cleanup
Only in singleton mode does the program itself need to clean up its directories. Several problems in these parts of the code cause the directories not to be cleaned. This commit fixes *some of* these issues. 1. btl/sm did not unlink its segment file. We never noticed this in non-singleton mode because pmix cleaned it up for us. After this change, the segment file created by sm in /dev/shm is cleaned up (when a singleton terminates normally). 2. Modified the singleton session directory structure and enabled recursive deletion. After this change, the session dir is cleaned up (when a singleton terminates normally). 3. Fixed a bug: the local peer count of a singleton should be 0, not 1. After this change, the btl/sm and btl/smcuda components return NULL during their init process and are closed automatically, so the btl/sm segment file in /dev/shm is never created in singleton mode. Signed-off-by: xbw <[email protected]>
1 parent c66b0c7 commit 7e18deb

File tree

3 files changed

+56
-21
lines changed

3 files changed

+56
-21
lines changed

ompi/runtime/ompi_rte.c

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
6868
* infrastructure that manages its structure (e.g., OpenPMIx). If we setup this
6969
* session directory structure, then we shall cleanup after ourselves.
7070
*/
71+
static bool destroy_top_session_dir = false;
7172
static bool destroy_job_session_dir = false;
7273
static bool destroy_proc_session_dir = false;
7374

@@ -888,8 +889,13 @@ int ompi_rte_init(int *pargc, char ***pargv)
888889
if (0 == opal_process_info.num_local_peers) {
889890
if (NULL != peers) {
890891
opal_process_info.num_local_peers = opal_argv_count(peers) - 1;
892+
} else if (opal_process_info.is_singleton) {
893+
/* if we are a singleton, then we have no local peers */
894+
opal_process_info.num_local_peers = 0;
891895
} else {
892-
opal_process_info.num_local_peers = 1;
896+
ret = OPAL_ERR_BAD_PARAM;
897+
error = "local peers";
898+
goto error;
893899
}
894900
}
895901
/* if my local rank is too high, then that's an error */
@@ -983,25 +989,28 @@ int ompi_rte_finalize(void)
983989
{
984990

985991
/* cleanup the session directory we created */
992+
if (NULL != opal_process_info.proc_session_dir && destroy_proc_session_dir) {
993+
opal_os_dirpath_destroy(opal_process_info.proc_session_dir,
994+
true, check_file);
995+
free(opal_process_info.proc_session_dir);
996+
opal_process_info.proc_session_dir = NULL;
997+
destroy_proc_session_dir = false;
998+
}
999+
9861000
if (NULL != opal_process_info.job_session_dir && destroy_job_session_dir) {
9871001
opal_os_dirpath_destroy(opal_process_info.job_session_dir,
988-
false, check_file);
1002+
true, check_file);
9891003
free(opal_process_info.job_session_dir);
9901004
opal_process_info.job_session_dir = NULL;
9911005
destroy_job_session_dir = false;
9921006
}
993-
994-
if (NULL != opal_process_info.top_session_dir) {
1007+
1008+
if (NULL != opal_process_info.top_session_dir && destroy_top_session_dir) {
1009+
opal_os_dirpath_destroy(opal_process_info.top_session_dir,
1010+
true, check_file);
9951011
free(opal_process_info.top_session_dir);
9961012
opal_process_info.top_session_dir = NULL;
997-
}
998-
999-
if (NULL != opal_process_info.proc_session_dir && destroy_proc_session_dir) {
1000-
opal_os_dirpath_destroy(opal_process_info.proc_session_dir,
1001-
false, check_file);
1002-
free(opal_process_info.proc_session_dir);
1003-
opal_process_info.proc_session_dir = NULL;
1004-
destroy_proc_session_dir = false;
1013+
destroy_top_session_dir = false;
10051014
}
10061015

10071016
if (NULL != opal_process_info.app_sizes) {
@@ -1165,27 +1174,45 @@ void ompi_rte_wait_for_debugger(void)
11651174

11661175
static int _setup_top_session_dir(char **sdir)
11671176
{
1177+
/*
1178+
* Use a session directory structure similar to prrte (create only one
1179+
* directory for the top session) so that it can be cleaned up correctly
1180+
* when terminated.
1181+
*/
11681182
char *tmpdir;
1183+
int rc;
1184+
uid_t uid = geteuid();
1185+
pid_t pid = getpid();
11691186

11701187
if( NULL == (tmpdir = getenv("TMPDIR")) )
11711188
if( NULL == (tmpdir = getenv("TEMP")) )
11721189
if( NULL == (tmpdir = getenv("TMP")) )
11731190
tmpdir = "/tmp";
11741191

1175-
*sdir = strdup(tmpdir);
1192+
if (0 > opal_asprintf(sdir, "%s/%s.%s.%lu.%lu",
1193+
tmpdir, "ompi",
1194+
opal_process_info.nodename,
1195+
(unsigned long)pid, (unsigned long) uid)) {
1196+
opal_process_info.top_session_dir = NULL;
1197+
return OPAL_ERR_OUT_OF_RESOURCE;
1198+
}
1199+
rc = opal_os_dirpath_create(opal_process_info.top_session_dir, 0755);
1200+
if (OPAL_SUCCESS != rc) {
1201+
// could not create top session dir
1202+
free(opal_process_info.top_session_dir);
1203+
opal_process_info.top_session_dir = NULL;
1204+
return rc;
1205+
}
1206+
destroy_top_session_dir = true;
11761207
return OPAL_SUCCESS;
11771208
}
11781209

11791210
static int _setup_job_session_dir(char **sdir)
11801211
{
11811212
int rc;
1182-
/* get the effective uid */
1183-
uid_t uid = geteuid();
11841213

1185-
if (0 > opal_asprintf(sdir, "%s/ompi.%s.%lu/jf.0/%u",
1214+
if (0 > opal_asprintf(sdir, "%s/%u",
11861215
opal_process_info.top_session_dir,
1187-
opal_process_info.nodename,
1188-
(unsigned long)uid,
11891216
opal_process_info.my_name.jobid)) {
11901217
opal_process_info.job_session_dir = NULL;
11911218
return OPAL_ERR_OUT_OF_RESOURCE;

opal/mca/btl/sm/btl_sm_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ static int mca_btl_sm_deregister_mem_knem(struct mca_btl_base_module_t *btl,
298298
return OPAL_SUCCESS;
299299
}
300300

301+
static void mca_btl_sm_component_finalize(void *data /*data unused*/) {
302+
opal_shmem_unlink(&mca_btl_sm_component.seg_ds);
303+
opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds);
304+
}
305+
301306
/*
302307
* SM component initialization
303308
*/
@@ -419,6 +424,12 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab
419424
/* set flag indicating btl not inited */
420425
mca_btl_sm.btl_inited = false;
421426

427+
/*
428+
* Use a method similar to `mca_btl_smcuda_component_init` to register segment finalize
429+
* to opal and release it before shmem is closed.
430+
*/
431+
opal_finalize_register_cleanup(mca_btl_sm_component_finalize);
432+
422433
return btls;
423434
failed:
424435
opal_shmem_unlink(&component->seg_ds);

opal/mca/btl/sm/btl_sm_module.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -347,9 +347,6 @@ static int sm_finalize(struct mca_btl_base_module_t *btl)
347347
free(component->fbox_in_endpoints);
348348
component->fbox_in_endpoints = NULL;
349349

350-
opal_shmem_unlink(&mca_btl_sm_component.seg_ds);
351-
opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds);
352-
353350
return OPAL_SUCCESS;
354351
}
355352

0 commit comments

Comments
 (0)