Skip to content

Commit 238e0ee

Browse files
authored
Merge pull request open-mpi#6 from raafatfeki/pr/vulcan_sendbuf_contg
Pr/vulcan sendbuf contg
2 parents 3a6d5d1 + b72534a commit 238e0ee

File tree

1 file changed

+83
-75
lines changed

1 file changed

+83
-75
lines changed

ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c

Lines changed: 83 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
#define DEBUG_ON 0
3737
#define FCOLL_VULCAN_SHUFFLE_TAG 123
3838
#define INIT_LEN 10
39-
39+
#define NOT_AGGR_INDEX -1
4040

4141
/*Used for loading file-offsets per aggregator*/
4242
typedef struct mca_io_ompio_local_io_array{
@@ -57,13 +57,11 @@ typedef struct mca_io_ompio_aggregator_data {
5757
int current_index, current_position;
5858
int bytes_to_write_in_cycle, bytes_remaining, procs_per_group;
5959
int *procs_in_group, iov_index;
60-
bool sendbuf_is_contiguous, prev_sendbuf_is_contiguous;
6160
int bytes_sent, prev_bytes_sent;
6261
struct iovec *decoded_iov;
6362
int bytes_to_write, prev_bytes_to_write;
6463
mca_io_ompio_io_array_t *io_array, *prev_io_array;
6564
int num_io_entries, prev_num_io_entries;
66-
char *send_buf, *prev_send_buf;
6765
} mca_io_ompio_aggregator_data;
6866

6967

@@ -78,9 +76,7 @@ typedef struct mca_io_ompio_aggregator_data {
7876
for (_i=0; _i<_num; _i++ ) { \
7977
_aggr[_i]->prev_io_array=_aggr[_i]->io_array; \
8078
_aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \
81-
_aggr[_i]->prev_send_buf=_aggr[_i]->send_buf; \
8279
_aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \
83-
_aggr[_i]->prev_sendbuf_is_contiguous=_aggr[_i]->sendbuf_is_contiguous; \
8480
_aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \
8581
_t=_aggr[_i]->prev_global_buf; \
8682
_aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \
@@ -213,8 +209,6 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
213209
aggr_data[i]->procs_in_group = fh->f_procs_in_group;
214210
aggr_data[i]->comm = fh->f_comm;
215211
aggr_data[i]->buf = (char *)buf; // should not be used in the new version.
216-
aggr_data[i]->sendbuf_is_contiguous = false; //safe assumption for right now
217-
aggr_data[i]->prev_sendbuf_is_contiguous = false; //safe assumption for right now
218212
}
219213

220214
/*********************************************************************
@@ -544,6 +538,7 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
544538
#endif
545539
}
546540

541+
int aggr_index = NOT_AGGR_INDEX;
547542
reqs1 = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*vulcan_num_io_procs *sizeof(ompi_request_t *));
548543
reqs2 = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*vulcan_num_io_procs *sizeof(ompi_request_t *));
549544
if ( NULL == reqs1 || NULL == reqs2 ) {
@@ -567,6 +562,11 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
567562
for ( i=0; i<vulcan_num_io_procs; i++ ) {
568563
ret = shuffle_init ( 0, cycles, aggregators[i], fh->f_rank, aggr_data[i],
569564
&curr_reqs[i*(fh->f_procs_per_group + 1)] );
565+
566+
if(aggregators[i] == fh->f_rank) {
567+
aggr_index = i;
568+
}
569+
570570
if ( OMPI_SUCCESS != ret ) {
571571
goto exit;
572572
}
@@ -595,23 +595,19 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
595595
}
596596

597597

598-
/* Write data for iteration i-1 */
599-
for ( i=0; i<vulcan_num_io_procs; i++ ) {
598+
/* Write data for iteration i-1; only an aggregator performs the write */
599+
if(NOT_AGGR_INDEX != aggr_index) {
600600
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
601601
start_write_time = MPI_Wtime();
602602
#endif
603-
ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize );
603+
ret = write_init (fh, aggregators[aggr_index], aggr_data[aggr_index], write_chunksize );
604604
if (OMPI_SUCCESS != ret){
605605
goto exit;
606-
}
606+
}
607607
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
608608
end_write_time = MPI_Wtime();
609609
write_time += end_write_time - start_write_time;
610610
#endif
611-
612-
if (!aggr_data[i]->prev_sendbuf_is_contiguous && aggr_data[i]->prev_bytes_sent) {
613-
free (aggr_data[i]->prev_send_buf);
614-
}
615611
}
616612

617613
} /* end for (index = 0; index < cycles; index++) */
@@ -629,22 +625,18 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
629625
}
630626

631627
/* Write data for iteration i=cycles-1 */
632-
for ( i=0; i<vulcan_num_io_procs; i++ ) {
628+
if(NOT_AGGR_INDEX != aggr_index) {
633629
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
634630
start_write_time = MPI_Wtime();
635631
#endif
636-
ret = write_init (fh, aggregators[i], aggr_data[i], write_chunksize );
632+
ret = write_init (fh, aggregators[aggr_index], aggr_data[aggr_index], write_chunksize );
637633
if (OMPI_SUCCESS != ret){
638634
goto exit;
639-
}
635+
}
640636
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
641637
end_write_time = MPI_Wtime();
642638
write_time += end_write_time - start_write_time;
643639
#endif
644-
645-
if (!aggr_data[i]->prev_sendbuf_is_contiguous && aggr_data[i]->prev_bytes_sent) {
646-
free (aggr_data[i]->prev_send_buf);
647-
}
648640
}
649641
}
650642

@@ -742,7 +734,7 @@ static int write_init (mca_io_ompio_file_t *fh, int aggregator, mca_io_ompio_agg
742734
int last_pos=0;
743735

744736

745-
if ( aggregator == fh->f_rank && aggr_data->prev_num_io_entries) {
737+
if (aggr_data->prev_num_io_entries) {
746738
while ( aggr_data->prev_bytes_to_write > 0 ) {
747739
aggr_data->prev_bytes_to_write -= mca_fcoll_vulcan_split_iov_array (fh, aggr_data->prev_io_array,
748740
aggr_data->prev_num_io_entries,
@@ -779,11 +771,13 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
779771
MPI_Aint *memory_displacements=NULL;
780772
int *temp_disp_index=NULL;
781773
MPI_Aint global_count = 0;
774+
int* blocklength_proc=NULL;
775+
ptrdiff_t* displs_proc=NULL;
782776

783777
data->num_io_entries = 0;
784778
data->bytes_sent = 0;
785779
data->io_array=NULL;
786-
data->send_buf=NULL;
780+
787781
/**********************************************************************
788782
*** 7a. Getting ready for next cycle: initializing and freeing buffers
789783
**********************************************************************/
@@ -1158,74 +1152,86 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
11581152
}
11591153
} /* end if (entries_per_aggr > 0 ) */
11601154
}/* end if (aggregator == rank ) */
1161-
1162-
if ( data->sendbuf_is_contiguous ) {
1163-
data->send_buf = &((char*)data->buf)[data->total_bytes_written];
1164-
}
1165-
else if (bytes_sent) {
1166-
/* allocate a send buffer and copy the data that needs
1167-
to be sent into it in case the data is non-contiguous
1168-
in memory */
1169-
ptrdiff_t mem_address;
1170-
size_t remaining = 0;
1171-
size_t temp_position = 0;
1172-
1173-
data->send_buf = malloc (bytes_sent);
1174-
if (NULL == data->send_buf) {
1155+
1156+
if (bytes_sent) {
1157+
size_t remaining = bytes_sent;
1158+
int block_index = -1;
1159+
int blocklength_size = INIT_LEN;
1160+
1161+
ptrdiff_t send_mem_address = NULL;
1162+
ompi_datatype_t *newType = MPI_DATATYPE_NULL;
1163+
blocklength_proc = (int *) calloc (blocklength_size, sizeof (int));
1164+
displs_proc = (ptrdiff_t *) calloc (blocklength_size, sizeof (ptrdiff_t));
1165+
1166+
if (NULL == blocklength_proc || NULL == displs_proc ) {
11751167
opal_output (1, "OUT OF MEMORY\n");
11761168
ret = OMPI_ERR_OUT_OF_RESOURCE;
11771169
goto exit;
11781170
}
1179-
1180-
remaining = bytes_sent;
1181-
1171+
11821172
while (remaining) {
1183-
mem_address = (ptrdiff_t)
1184-
(data->decoded_iov[data->iov_index].iov_base) + data->current_position;
1185-
1173+
block_index++;
1174+
1175+
if(0 == block_index) {
1176+
send_mem_address = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) +
1177+
data->current_position;
1178+
}
1179+
else {
1180+
// Reallocate more memory if blocklength_size is not enough
1181+
if(0 == block_index % INIT_LEN) {
1182+
blocklength_size += INIT_LEN;
1183+
blocklength_proc = (int *) realloc(blocklength_proc, blocklength_size * sizeof(int));
1184+
displs_proc = (ptrdiff_t *) realloc(displs_proc, blocklength_size * sizeof(ptrdiff_t));
1185+
}
1186+
displs_proc[block_index] = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) +
1187+
data->current_position - send_mem_address;
1188+
}
1189+
11861190
if (remaining >=
11871191
(data->decoded_iov[data->iov_index].iov_len - data->current_position)) {
1188-
memcpy (data->send_buf+temp_position,
1189-
(IOVBASE_TYPE *)mem_address,
1190-
data->decoded_iov[data->iov_index].iov_len - data->current_position);
1192+
1193+
blocklength_proc[block_index] = data->decoded_iov[data->iov_index].iov_len -
1194+
data->current_position;
11911195
remaining = remaining -
1192-
(data->decoded_iov[data->iov_index].iov_len - data->current_position);
1193-
temp_position = temp_position +
1194-
(data->decoded_iov[data->iov_index].iov_len - data->current_position);
1196+
(data->decoded_iov[data->iov_index].iov_len - data->current_position);
11951197
data->iov_index = data->iov_index + 1;
11961198
data->current_position = 0;
11971199
}
11981200
else {
1199-
memcpy (data->send_buf+temp_position,
1200-
(IOVBASE_TYPE *) mem_address,
1201-
remaining);
1201+
blocklength_proc[block_index] = remaining;
12021202
data->current_position += remaining;
12031203
remaining = 0;
12041204
}
12051205
}
1206-
}
1207-
data->total_bytes_written += bytes_sent;
1208-
data->bytes_sent = bytes_sent;
1209-
/* Gather the sendbuf from each process in appropriate locations in
1210-
aggregators*/
1211-
1212-
if (bytes_sent){
1213-
ret = MCA_PML_CALL(isend(data->send_buf,
1214-
bytes_sent,
1215-
MPI_BYTE,
1216-
aggregator,
1217-
FCOLL_VULCAN_SHUFFLE_TAG+index,
1218-
MCA_PML_BASE_SEND_STANDARD,
1219-
data->comm,
1220-
&reqs[data->procs_per_group]));
1221-
1222-
1223-
if ( OMPI_SUCCESS != ret ){
1224-
goto exit;
1206+
1207+
data->total_bytes_written += bytes_sent;
1208+
data->bytes_sent = bytes_sent;
1209+
1210+
if ( 0 <= block_index ) {
1211+
ompi_datatype_create_hindexed(block_index+1,
1212+
blocklength_proc,
1213+
displs_proc,
1214+
MPI_BYTE,
1215+
&newType);
1216+
ompi_datatype_commit(&newType);
1217+
1218+
ret = MCA_PML_CALL(isend((char *)send_mem_address,
1219+
1,
1220+
newType,
1221+
aggregator,
1222+
FCOLL_VULCAN_SHUFFLE_TAG+index,
1223+
MCA_PML_BASE_SEND_STANDARD,
1224+
data->comm,
1225+
&reqs[data->procs_per_group]));
1226+
if ( MPI_DATATYPE_NULL != newType ) {
1227+
ompi_datatype_destroy(&newType);
1228+
}
1229+
if (OMPI_SUCCESS != ret){
1230+
goto exit;
1231+
}
12251232
}
1226-
12271233
}
1228-
1234+
12291235
#if DEBUG_ON
12301236
if (aggregator == rank){
12311237
printf("************Cycle: %d, Aggregator: %d ***************\n",
@@ -1301,7 +1307,9 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
13011307
free(sorted_file_offsets);
13021308
free(file_offsets_for_agg);
13031309
free(memory_displacements);
1304-
1310+
free(blocklength_proc);
1311+
free(displs_proc);
1312+
13051313
return OMPI_SUCCESS;
13061314
}
13071315

0 commit comments

Comments
 (0)