Skip to content

Commit 8f6a839

Browse files
committed
mpi: retain operation and datatype in non blocking collectives
The MPI standard states that a user-defined MPI_Op and/or MPI_Datatype can be freed after a call to a non-blocking collective and before that non-blocking collective completes. Retain user-defined (only) MPI_Op and MPI_Datatype objects when the non-blocking call is invoked, and set a request completion callback so they are released when the MPI_Request completes. Thanks to Thomas Ponweiser for reporting this. Fixes #2151 Fixes #1304 Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent 163bbd4 commit 8f6a839

23 files changed

+306
-57
lines changed

ompi/mca/coll/base/coll_base_util.c

Lines changed: 162 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2017 Research Organization for Information Science
13-
* and Technology (RIST). All rights reserved.
12+
* Copyright (c) 2014-2019 Research Organization for Information Science
13+
* and Technology (RIST). All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -29,6 +29,27 @@
2929
#include "ompi/mca/pml/pml.h"
3030
#include "coll_base_util.h"
3131

32+
/*
 * Bookkeeping record attached to a request by ompi_coll_base_retain_op().
 * It remembers the completion callback that was on the request before
 * release_op_callback was installed, plus the objects to release once the
 * request completes.
 */
struct retain_op_data {
33+
/* completion callback previously registered on the request (may be NULL) */
ompi_request_complete_fn_t req_complete_cb;
34+
/* callback data previously registered on the request */
void *req_complete_cb_data;
35+
/* retained non-intrinsic operation, or NULL if none was retained */
ompi_op_t *op;
36+
/* retained non-predefined datatype, or NULL if none was retained */
ompi_datatype_t *datatype;
37+
};
38+
39+
/*
 * Bookkeeping record attached to a request by
 * ompi_coll_base_retain_datatypes(). It remembers the completion callback
 * that was on the request before release_datatypes_callback was installed,
 * plus the send/receive datatypes to release once the request completes.
 */
struct retain_datatypes_data {
40+
/* completion callback previously registered on the request (may be NULL) */
ompi_request_complete_fn_t req_complete_cb;
41+
/* callback data previously registered on the request */
void *req_complete_cb_data;
42+
/* retained non-predefined send datatype, or NULL if none was retained */
ompi_datatype_t *stype;
43+
/* retained non-predefined receive datatype, or NULL if none was retained */
ompi_datatype_t *rtype;
44+
};
45+
46+
/*
 * Bookkeeping record attached to a request by
 * ompi_coll_base_retain_datatypes_w(). It remembers the completion callback
 * that was on the request before release_datatypes_w_callback was installed,
 * followed by a variable-length list of retained datatypes.
 */
struct retain_datatypes_w_data {
47+
/* completion callback previously registered on the request (may be NULL) */
ompi_request_complete_fn_t req_complete_cb;
48+
/* callback data previously registered on the request */
void *req_complete_cb_data;
49+
/* number of entries of types[] that release_datatypes_w_callback releases */
int count;
50+
/* flexible array member holding the retained datatypes */
ompi_datatype_t *types[];
51+
};
52+
3253
int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
3354
ompi_datatype_t* sdatatype,
3455
int dest, int stag,
@@ -103,3 +124,142 @@ int ompi_rounddown(int num, int factor)
103124
num /= factor;
104125
return num * factor; /* floor(num / factor) * factor */
105126
}
127+
128+
static int release_op_callback(struct ompi_request_t *request) {
129+
struct retain_op_data * p = (struct retain_op_data *)request->req_complete_cb_data;
130+
int rc = OMPI_SUCCESS;
131+
assert (NULL != p);
132+
if (NULL != p->req_complete_cb) {
133+
request->req_complete_cb = p->req_complete_cb;
134+
request->req_complete_cb_data = p->req_complete_cb_data;
135+
rc = request->req_complete_cb(request);
136+
}
137+
if (NULL != p->op) {
138+
OBJ_RELEASE(p->op);
139+
}
140+
if (NULL != p->datatype) {
141+
OBJ_RELEASE(p->datatype);
142+
}
143+
free(p);
144+
return rc;
145+
}
146+
147+
int ompi_coll_base_retain_op( ompi_request_t *request, ompi_op_t *op,
148+
ompi_datatype_t *type) {
149+
bool retain = !ompi_op_is_intrinsic(op);
150+
retain |= !ompi_datatype_is_predefined(type);
151+
if (OPAL_UNLIKELY(retain)) {
152+
struct retain_op_data *p = (struct retain_op_data *)calloc(1, sizeof(struct retain_op_data));
153+
if (OPAL_UNLIKELY(NULL == p)) {
154+
return OMPI_ERR_OUT_OF_RESOURCE;
155+
}
156+
if (!ompi_op_is_intrinsic(op)) {
157+
OBJ_RETAIN(op);
158+
p->op = op;
159+
}
160+
if (!ompi_datatype_is_predefined(type)) {
161+
OBJ_RETAIN(type);
162+
p->datatype = type;
163+
}
164+
p->req_complete_cb = request->req_complete_cb;
165+
p->req_complete_cb_data = request->req_complete_cb_data;
166+
request->req_complete_cb = release_op_callback;
167+
request->req_complete_cb_data = p;
168+
}
169+
return OMPI_SUCCESS;
170+
}
171+
172+
static int release_datatypes_callback(struct ompi_request_t *request) {
173+
struct retain_datatypes_data * p = (struct retain_datatypes_data *)request->req_complete_cb_data;
174+
int rc = OMPI_SUCCESS;
175+
assert (NULL != p);
176+
if (NULL != p->req_complete_cb) {
177+
request->req_complete_cb = p->req_complete_cb;
178+
request->req_complete_cb_data = p->req_complete_cb_data;
179+
rc = request->req_complete_cb(request);
180+
}
181+
if (NULL != p->stype) {
182+
OBJ_RELEASE(p->stype);
183+
}
184+
if (NULL != p->rtype) {
185+
OBJ_RELEASE(p->rtype);
186+
}
187+
free(p);
188+
return rc;
189+
}
190+
191+
/*
 * Keep user-defined send/receive datatypes referenced until a request
 * completes.
 *
 * A datatype is retained when it is non-NULL and not predefined. If at least
 * one of the two qualifies, a bookkeeping record is allocated, the qualifying
 * datatype(s) are retained, the request's current completion callback and
 * data are saved in the record, and release_datatypes_callback is installed
 * on the request in their place. Otherwise the request is left untouched.
 *
 * @param request  request to attach the completion callback to
 * @param stype    send datatype (may be NULL)
 * @param rtype    receive datatype (may be NULL)
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the record could not
 *         be allocated
 */
int ompi_coll_base_retain_datatypes( ompi_request_t *request, ompi_datatype_t *stype,
                                     ompi_datatype_t *rtype) {
    const bool keep_stype = (NULL != stype) && !ompi_datatype_is_predefined(stype);
    const bool keep_rtype = (NULL != rtype) && !ompi_datatype_is_predefined(rtype);

    if (OPAL_UNLIKELY(keep_stype || keep_rtype)) {
        struct retain_datatypes_data *record = calloc(1, sizeof *record);
        if (OPAL_UNLIKELY(NULL == record)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* Fields left untouched stay NULL thanks to calloc, which is how the
         * release callback knows what was retained. */
        if (keep_stype) {
            OBJ_RETAIN(stype);
            record->stype = stype;
        }
        if (keep_rtype) {
            OBJ_RETAIN(rtype);
            record->rtype = rtype;
        }

        /* Chain: remember whatever was registered before, then take over. */
        record->req_complete_cb = request->req_complete_cb;
        record->req_complete_cb_data = request->req_complete_cb_data;
        request->req_complete_cb = release_datatypes_callback;
        request->req_complete_cb_data = record;
    }

    return OMPI_SUCCESS;
}
215+
216+
static int release_datatypes_w_callback(struct ompi_request_t *request) {
217+
struct retain_datatypes_w_data * p = (struct retain_datatypes_w_data *)request->req_complete_cb_data;
218+
int rc = OMPI_SUCCESS;
219+
assert (NULL != p);
220+
if (NULL != p->req_complete_cb) {
221+
request->req_complete_cb = p->req_complete_cb;
222+
request->req_complete_cb_data = p->req_complete_cb_data;
223+
rc = request->req_complete_cb(request);
224+
}
225+
for (int i=0; i<p->count; i++) {
226+
OBJ_RELEASE(p->types[i]);
227+
}
228+
free(p);
229+
return rc;
230+
}
231+
232+
/*
 * Keep the user-defined datatypes of a "w" collective (one send and one
 * receive datatype per peer) referenced until a request completes.
 *
 * Every non-NULL, non-predefined entry among the first `count` elements of
 * stypes[] and rtypes[] is retained and stored in a bookkeeping record. The
 * request's current completion callback and data are saved in that record
 * and release_datatypes_w_callback is installed in their place. When no
 * entry qualifies the request is left untouched.
 *
 * @param request  request to attach the completion callback to
 * @param count    number of entries to inspect in stypes[] and rtypes[]
 * @param stypes   send datatypes (entries may be NULL)
 * @param rtypes   receive datatypes (entries may be NULL)
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the record could not
 *         be allocated
 */
int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, int count,
                                       ompi_datatype_t *const stypes[], ompi_datatype_t *const rtypes[]) {
    /* First pass: count how many datatypes need to be retained. */
    int datatypes = 0;
    for (int i=0; i<count; i++) {
        if (NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
            datatypes++;
        }
        if (NULL != rtypes[i] && !ompi_datatype_is_predefined(rtypes[i])) {
            datatypes++;
        }
    }

    if (OPAL_UNLIKELY(0 < datatypes)) {
        /* Size the record from its own type: header plus one slot per
         * retained datatype in the flexible array member. */
        struct retain_datatypes_w_data *p =
            calloc(1, sizeof(*p) + (size_t)datatypes * sizeof(p->types[0]));
        if (OPAL_UNLIKELY(NULL == p)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* The release callback iterates over p->count entries; without this
         * assignment it would see 0 and never release anything. */
        p->count = datatypes;

        /* Second pass: retain and record, using the same predicate as the
         * counting pass so exactly `datatypes` slots are filled. */
        int slot = 0;
        for (int i=0; i<count; i++) {
            if (NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
                p->types[slot++] = stypes[i];
                OBJ_RETAIN(stypes[i]);
            }
            if (NULL != rtypes[i] && !ompi_datatype_is_predefined(rtypes[i])) {
                p->types[slot++] = rtypes[i];
                OBJ_RETAIN(rtypes[i]);
            }
        }

        /* Chain: remember whatever was registered before, then take over. */
        p->req_complete_cb = request->req_complete_cb;
        p->req_complete_cb_data = request->req_complete_cb_data;
        request->req_complete_cb = release_datatypes_w_callback;
        request->req_complete_cb_data = p;
    }

    return OMPI_SUCCESS;
}

ompi/mca/coll/base/coll_base_util.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2017 Research Organization for Information Science
13-
* and Technology (RIST). All rights reserved.
12+
* Copyright (c) 2014-2019 Research Organization for Information Science
13+
* and Technology (RIST). All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -27,6 +27,7 @@
2727
#include "ompi/mca/mca.h"
2828
#include "ompi/datatype/ompi_datatype.h"
2929
#include "ompi/request/request.h"
30+
#include "ompi/op/op.h"
3031
#include "ompi/mca/pml/pml.h"
3132

3233
BEGIN_C_DECLS
@@ -84,5 +85,15 @@ unsigned int ompi_mirror_perm(unsigned int x, int nbits);
8485
*/
8586
int ompi_rounddown(int num, int factor);
8687

88+
/**
 * Retain a non-intrinsic operation and/or non-predefined datatype and chain
 * a completion callback on the request that releases them.
 *
 * @param request  request to attach the completion callback to
 * @param op       reduction operation used by the collective
 * @param type     datatype used by the collective
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE on allocation failure
 */
int ompi_coll_base_retain_op( ompi_request_t *request, ompi_op_t *op,
89+
ompi_datatype_t *type);
90+
91+
/**
 * Retain non-NULL, non-predefined send/receive datatypes and chain a
 * completion callback on the request that releases them.
 *
 * @param request  request to attach the completion callback to
 * @param stype    send datatype (may be NULL)
 * @param rtype    receive datatype (may be NULL)
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE on allocation failure
 */
int ompi_coll_base_retain_datatypes( ompi_request_t *request, ompi_datatype_t *stype,
92+
ompi_datatype_t *rtype);
93+
94+
/**
 * Retain every non-NULL, non-predefined datatype among the first count
 * entries of stypes[] and rtypes[], and chain a completion callback on the
 * request.
 *
 * @param request  request to attach the completion callback to
 * @param count    number of entries to inspect in stypes[] and rtypes[]
 * @param stypes   send datatypes (entries may be NULL)
 * @param rtypes   receive datatypes (entries may be NULL)
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE on allocation failure
 */
int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, int count,
95+
ompi_datatype_t *const stypes[],
96+
ompi_datatype_t *const rtypes[]);
97+
8798
END_C_DECLS
8899
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

ompi/mpi/c/iallgather.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved.
1515
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1616
* reserved.
17-
* Copyright (c) 2015 Research Organization for Information Science
18-
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2015-2019 Research Organization for Information Science
18+
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -31,6 +31,7 @@
3131
#include "ompi/communicator/communicator.h"
3232
#include "ompi/errhandler/errhandler.h"
3333
#include "ompi/datatype/ompi_datatype.h"
34+
#include "ompi/mca/coll/base/coll_base_util.h"
3435
#include "ompi/memchecker.h"
3536
#include "ompi/runtime/ompi_spc.h"
3637

@@ -102,6 +103,9 @@ int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
102103
err = comm->c_coll->coll_iallgather(sendbuf, sendcount, sendtype,
103104
recvbuf, recvcount, recvtype, comm,
104105
request, comm->c_coll->coll_iallgather_module);
106+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
107+
ompi_coll_base_retain_datatypes(*request, sendtype, recvtype);
108+
}
105109

106110
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
107111
}

ompi/mpi/c/iallgatherv.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1616
* reserved.
17-
* Copyright (c) 2015 Research Organization for Information Science
18-
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2015-2019 Research Organization for Information Science
18+
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -31,6 +31,7 @@
3131
#include "ompi/communicator/communicator.h"
3232
#include "ompi/errhandler/errhandler.h"
3333
#include "ompi/datatype/ompi_datatype.h"
34+
#include "ompi/mca/coll/base/coll_base_util.h"
3435
#include "ompi/memchecker.h"
3536
#include "ompi/runtime/ompi_spc.h"
3637

@@ -126,6 +127,9 @@ int MPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
126127
recvbuf, recvcounts, displs,
127128
recvtype, comm, request,
128129
comm->c_coll->coll_iallgatherv_module);
130+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
131+
ompi_coll_base_retain_datatypes(*request, sendtype, recvtype);
132+
}
129133
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
130134
}
131135

ompi/mpi/c/iallreduce.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
* All rights reserved.
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015 Research Organization for Information Science
16-
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2015-2019 Research Organization for Information Science
16+
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1818
* $COPYRIGHT$
1919
*
@@ -31,6 +31,7 @@
3131
#include "ompi/errhandler/errhandler.h"
3232
#include "ompi/datatype/ompi_datatype.h"
3333
#include "ompi/op/op.h"
34+
#include "ompi/mca/coll/base/coll_base_util.h"
3435
#include "ompi/memchecker.h"
3536
#include "ompi/runtime/ompi_spc.h"
3637

@@ -112,10 +113,11 @@ int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count,
112113

113114
/* Invoke the coll component to perform the back-end operation */
114115

115-
OBJ_RETAIN(op);
116116
err = comm->c_coll->coll_iallreduce(sendbuf, recvbuf, count, datatype,
117117
op, comm, request, comm->c_coll->coll_iallreduce_module);
118-
OBJ_RELEASE(op);
118+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
119+
ompi_coll_base_retain_op(*request, op, datatype);
120+
}
119121
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
120122
}
121123

ompi/mpi/c/ialltoall.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved.
1515
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1616
* reserved.
17-
* Copyright (c) 2014-2016 Research Organization for Information Science
18-
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2014-2019 Research Organization for Information Science
18+
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -31,6 +31,7 @@
3131
#include "ompi/communicator/communicator.h"
3232
#include "ompi/errhandler/errhandler.h"
3333
#include "ompi/datatype/ompi_datatype.h"
34+
#include "ompi/mca/coll/base/coll_base_util.h"
3435
#include "ompi/memchecker.h"
3536
#include "ompi/runtime/ompi_spc.h"
3637

@@ -101,5 +102,8 @@ int MPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
101102
err = comm->c_coll->coll_ialltoall(sendbuf, sendcount, sendtype,
102103
recvbuf, recvcount, recvtype, comm,
103104
request, comm->c_coll->coll_ialltoall_module);
105+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
106+
ompi_coll_base_retain_datatypes(*request, sendtype, recvtype);
107+
}
104108
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
105109
}

ompi/mpi/c/ialltoallv.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2014-2016 Research Organization for Information Science
17-
* and Technology (RIST). All rights reserved.
16+
* Copyright (c) 2014-2019 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -30,6 +30,7 @@
3030
#include "ompi/communicator/communicator.h"
3131
#include "ompi/errhandler/errhandler.h"
3232
#include "ompi/datatype/ompi_datatype.h"
33+
#include "ompi/mca/coll/base/coll_base_util.h"
3334
#include "ompi/memchecker.h"
3435
#include "ompi/runtime/ompi_spc.h"
3536

@@ -130,6 +131,9 @@ int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispl
130131
err = comm->c_coll->coll_ialltoallv(sendbuf, sendcounts, sdispls,
131132
sendtype, recvbuf, recvcounts, rdispls,
132133
recvtype, comm, request, comm->c_coll->coll_ialltoallv_module);
134+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
135+
ompi_coll_base_retain_datatypes(*request, sendtype, recvtype);
136+
}
133137
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
134138
}
135139

ompi/mpi/c/ialltoallw.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2014-2016 Research Organization for Information Science
17-
* and Technology (RIST). All rights reserved.
16+
* Copyright (c) 2014-2019 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -30,6 +30,7 @@
3030
#include "ompi/communicator/communicator.h"
3131
#include "ompi/errhandler/errhandler.h"
3232
#include "ompi/datatype/ompi_datatype.h"
33+
#include "ompi/mca/coll/base/coll_base_util.h"
3334
#include "ompi/memchecker.h"
3435
#include "ompi/runtime/ompi_spc.h"
3536

@@ -127,6 +128,12 @@ int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispl
127128
sendtypes, recvbuf, recvcounts,
128129
rdispls, recvtypes, comm, request,
129130
comm->c_coll->coll_ialltoallw_module);
131+
if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
132+
ompi_coll_base_retain_datatypes_w(*request,
133+
OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm),
134+
sendtypes,
135+
recvtypes);
136+
}
130137
OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
131138
}
132139

0 commit comments

Comments
 (0)