Skip to content

Commit 40ee5d2

Browse files
committed
Fix singleton tmp files cleanup
Only in singleton mode does the program itself need to clean up its directories. Several problems in this part of the code prevent the directories from being cleaned. This commit fixes *some of* these issues. 1. btl/sm does not unlink its segment file. We never noticed this in non-singleton mode because pmix cleaned it up for us. After this change, the segment file created by sm in /dev/shm is cleaned up (when a singleton terminates normally). 2. Modified the singleton session directory structure and enabled recursive deletion. After this change, the session dir is cleaned up (when a singleton terminates normally). 3. Fix a bug - the local peer number of a singleton should be 0, not 1. After this change, the btl/sm and btl/smcuda components return NULL during their init process and are automatically closed, so the btl/sm segment file in /dev/shm is never created in singleton mode. Signed-off-by: xbw <[email protected]>
1 parent c66b0c7 commit 40ee5d2

File tree

3 files changed

+46
-16
lines changed

3 files changed

+46
-16
lines changed

ompi/runtime/ompi_rte.c

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
6868
* infrastructure that manages its structure (e.g., OpenPMIx). If we setup this
6969
* session directory structure, then we shall cleanup after ourselves.
7070
*/
71+
static bool destroy_top_session_dir = false;
7172
static bool destroy_job_session_dir = false;
7273
static bool destroy_proc_session_dir = false;
7374

@@ -983,22 +984,25 @@ int ompi_rte_finalize(void)
983984
{
984985

985986
/* cleanup the session directory we created */
987+
if (NULL != opal_process_info.top_session_dir && destroy_top_session_dir) {
988+
opal_os_dirpath_destroy(opal_process_info.top_session_dir,
989+
true, check_file);
990+
free(opal_process_info.top_session_dir);
991+
opal_process_info.top_session_dir = NULL;
992+
destroy_top_session_dir = false;
993+
}
994+
986995
if (NULL != opal_process_info.job_session_dir && destroy_job_session_dir) {
987996
opal_os_dirpath_destroy(opal_process_info.job_session_dir,
988-
false, check_file);
997+
true, check_file);
989998
free(opal_process_info.job_session_dir);
990999
opal_process_info.job_session_dir = NULL;
9911000
destroy_job_session_dir = false;
9921001
}
9931002

994-
if (NULL != opal_process_info.top_session_dir) {
995-
free(opal_process_info.top_session_dir);
996-
opal_process_info.top_session_dir = NULL;
997-
}
998-
9991003
if (NULL != opal_process_info.proc_session_dir && destroy_proc_session_dir) {
10001004
opal_os_dirpath_destroy(opal_process_info.proc_session_dir,
1001-
false, check_file);
1005+
true, check_file);
10021006
free(opal_process_info.proc_session_dir);
10031007
opal_process_info.proc_session_dir = NULL;
10041008
destroy_proc_session_dir = false;
@@ -1165,27 +1169,45 @@ void ompi_rte_wait_for_debugger(void)
11651169

11661170
static int _setup_top_session_dir(char **sdir)
11671171
{
1172+
/*
1173+
* Use a session directory structure similar to prrte (create only one
1174+
* directory for the top session) so that it can be cleaned up correctly
1175+
* when terminated.
1176+
*/
11681177
char *tmpdir;
1178+
int rc;
1179+
uid_t uid = geteuid();
1180+
pid_t pid = getpid();
11691181

11701182
if( NULL == (tmpdir = getenv("TMPDIR")) )
11711183
if( NULL == (tmpdir = getenv("TEMP")) )
11721184
if( NULL == (tmpdir = getenv("TMP")) )
11731185
tmpdir = "/tmp";
11741186

1175-
*sdir = strdup(tmpdir);
1187+
if (0 > opal_asprintf(sdir, "%s/%s.%s.%lu.%lu",
1188+
tmpdir, "ompi",
1189+
opal_process_info.nodename,
1190+
(unsigned long)pid, (unsigned long) uid)) {
1191+
opal_process_info.top_session_dir = NULL;
1192+
return OPAL_ERR_OUT_OF_RESOURCE;
1193+
}
1194+
rc = opal_os_dirpath_create(opal_process_info.top_session_dir, 0755);
1195+
if (OPAL_SUCCESS != rc) {
1196+
// could not create top session dir
1197+
free(opal_process_info.top_session_dir);
1198+
opal_process_info.top_session_dir = NULL;
1199+
return rc;
1200+
}
1201+
destroy_top_session_dir = true;
11761202
return OPAL_SUCCESS;
11771203
}
11781204

11791205
static int _setup_job_session_dir(char **sdir)
11801206
{
11811207
int rc;
1182-
/* get the effective uid */
1183-
uid_t uid = geteuid();
11841208

1185-
if (0 > opal_asprintf(sdir, "%s/ompi.%s.%lu/jf.0/%u",
1209+
if (0 > opal_asprintf(sdir, "%s/%u",
11861210
opal_process_info.top_session_dir,
1187-
opal_process_info.nodename,
1188-
(unsigned long)uid,
11891211
opal_process_info.my_name.jobid)) {
11901212
opal_process_info.job_session_dir = NULL;
11911213
return OPAL_ERR_OUT_OF_RESOURCE;

opal/mca/btl/sm/btl_sm_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ static int mca_btl_sm_deregister_mem_knem(struct mca_btl_base_module_t *btl,
298298
return OPAL_SUCCESS;
299299
}
300300

301+
static void mca_btl_sm_component_finalize(void *data /*data unused*/) {
302+
opal_shmem_unlink(&mca_btl_sm_component.seg_ds);
303+
opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds);
304+
}
305+
301306
/*
302307
* SM component initialization
303308
*/
@@ -419,6 +424,12 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab
419424
/* set flag indicating btl not inited */
420425
mca_btl_sm.btl_inited = false;
421426

427+
/*
428+
* Use a method similar to `mca_btl_smcuda_component_init` to register segment finalize
429+
* to opal and release it before shmem is closed.
430+
*/
431+
opal_finalize_register_cleanup(mca_btl_sm_component_finalize);
432+
422433
return btls;
423434
failed:
424435
opal_shmem_unlink(&component->seg_ds);

opal/mca/btl/sm/btl_sm_module.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -347,9 +347,6 @@ static int sm_finalize(struct mca_btl_base_module_t *btl)
347347
free(component->fbox_in_endpoints);
348348
component->fbox_in_endpoints = NULL;
349349

350-
opal_shmem_unlink(&mca_btl_sm_component.seg_ds);
351-
opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds);
352-
353350
return OPAL_SUCCESS;
354351
}
355352

0 commit comments

Comments
 (0)