36
36
#define DEBUG_ON 0
37
37
#define FCOLL_VULCAN_SHUFFLE_TAG 123
38
38
#define INIT_LEN 10
39
-
39
+ #define NOT_AGGR_INDEX -1
40
40
41
41
/*Used for loading file-offsets per aggregator*/
42
42
typedef struct mca_io_ompio_local_io_array {
@@ -57,13 +57,11 @@ typedef struct mca_io_ompio_aggregator_data {
57
57
int current_index , current_position ;
58
58
int bytes_to_write_in_cycle , bytes_remaining , procs_per_group ;
59
59
int * procs_in_group , iov_index ;
60
- bool sendbuf_is_contiguous , prev_sendbuf_is_contiguous ;
61
60
int bytes_sent , prev_bytes_sent ;
62
61
struct iovec * decoded_iov ;
63
62
int bytes_to_write , prev_bytes_to_write ;
64
63
mca_io_ompio_io_array_t * io_array , * prev_io_array ;
65
64
int num_io_entries , prev_num_io_entries ;
66
- char * send_buf , * prev_send_buf ;
67
65
} mca_io_ompio_aggregator_data ;
68
66
69
67
@@ -78,9 +76,7 @@ typedef struct mca_io_ompio_aggregator_data {
78
76
for (_i=0; _i<_num; _i++ ) { \
79
77
_aggr[_i]->prev_io_array=_aggr[_i]->io_array; \
80
78
_aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \
81
- _aggr[_i]->prev_send_buf=_aggr[_i]->send_buf; \
82
79
_aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \
83
- _aggr[_i]->prev_sendbuf_is_contiguous=_aggr[_i]->sendbuf_is_contiguous; \
84
80
_aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \
85
81
_t=_aggr[_i]->prev_global_buf; \
86
82
_aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \
@@ -213,8 +209,6 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
213
209
aggr_data [i ]-> procs_in_group = fh -> f_procs_in_group ;
214
210
aggr_data [i ]-> comm = fh -> f_comm ;
215
211
aggr_data [i ]-> buf = (char * )buf ; // should not be used in the new version.
216
- aggr_data [i ]-> sendbuf_is_contiguous = false; //safe assumption for right now
217
- aggr_data [i ]-> prev_sendbuf_is_contiguous = false; //safe assumption for right now
218
212
}
219
213
220
214
/*********************************************************************
@@ -544,6 +538,7 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
544
538
#endif
545
539
}
546
540
541
+ int aggr_index = NOT_AGGR_INDEX ;
547
542
reqs1 = (ompi_request_t * * )malloc ((fh -> f_procs_per_group + 1 )* vulcan_num_io_procs * sizeof (ompi_request_t * ));
548
543
reqs2 = (ompi_request_t * * )malloc ((fh -> f_procs_per_group + 1 )* vulcan_num_io_procs * sizeof (ompi_request_t * ));
549
544
if ( NULL == reqs1 || NULL == reqs2 ) {
@@ -567,6 +562,11 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
567
562
for ( i = 0 ; i < vulcan_num_io_procs ; i ++ ) {
568
563
ret = shuffle_init ( 0 , cycles , aggregators [i ], fh -> f_rank , aggr_data [i ],
569
564
& curr_reqs [i * (fh -> f_procs_per_group + 1 )] );
565
+
566
+ if (aggregators [i ] == fh -> f_rank ) {
567
+ aggr_index = i ;
568
+ }
569
+
570
570
if ( OMPI_SUCCESS != ret ) {
571
571
goto exit ;
572
572
}
@@ -595,23 +595,19 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
595
595
}
596
596
597
597
598
- /* Write data for iteration i-1 */
599
- for ( i = 0 ; i < vulcan_num_io_procs ; i ++ ) {
598
+ /* Write data for iteration i-1 only by an aggregator */
599
+ if ( NOT_AGGR_INDEX != aggr_index ) {
600
600
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
601
601
start_write_time = MPI_Wtime ();
602
602
#endif
603
- ret = write_init (fh , aggregators [i ], aggr_data [i ], write_chunksize );
603
+ ret = write_init (fh , aggregators [aggr_index ], aggr_data [aggr_index ], write_chunksize );
604
604
if (OMPI_SUCCESS != ret ){
605
605
goto exit ;
606
- }
606
+ }
607
607
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
608
608
end_write_time = MPI_Wtime ();
609
609
write_time += end_write_time - start_write_time ;
610
610
#endif
611
-
612
- if (!aggr_data [i ]-> prev_sendbuf_is_contiguous && aggr_data [i ]-> prev_bytes_sent ) {
613
- free (aggr_data [i ]-> prev_send_buf );
614
- }
615
611
}
616
612
617
613
} /* end for (index = 0; index < cycles; index++) */
@@ -629,22 +625,18 @@ int mca_fcoll_vulcan_file_write_all (mca_io_ompio_file_t *fh,
629
625
}
630
626
631
627
/* Write data for iteration i=cycles-1 */
632
- for ( i = 0 ; i < vulcan_num_io_procs ; i ++ ) {
628
+ if ( NOT_AGGR_INDEX != aggr_index ) {
633
629
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
634
630
start_write_time = MPI_Wtime ();
635
631
#endif
636
- ret = write_init (fh , aggregators [i ], aggr_data [i ], write_chunksize );
632
+ ret = write_init (fh , aggregators [aggr_index ], aggr_data [aggr_index ], write_chunksize );
637
633
if (OMPI_SUCCESS != ret ){
638
634
goto exit ;
639
- }
635
+ }
640
636
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
641
637
end_write_time = MPI_Wtime ();
642
638
write_time += end_write_time - start_write_time ;
643
639
#endif
644
-
645
- if (!aggr_data [i ]-> prev_sendbuf_is_contiguous && aggr_data [i ]-> prev_bytes_sent ) {
646
- free (aggr_data [i ]-> prev_send_buf );
647
- }
648
640
}
649
641
}
650
642
@@ -742,7 +734,7 @@ static int write_init (mca_io_ompio_file_t *fh, int aggregator, mca_io_ompio_agg
742
734
int last_pos = 0 ;
743
735
744
736
745
- if ( aggregator == fh -> f_rank && aggr_data -> prev_num_io_entries ) {
737
+ if (aggr_data -> prev_num_io_entries ) {
746
738
while ( aggr_data -> prev_bytes_to_write > 0 ) {
747
739
aggr_data -> prev_bytes_to_write -= mca_fcoll_vulcan_split_iov_array (fh , aggr_data -> prev_io_array ,
748
740
aggr_data -> prev_num_io_entries ,
@@ -779,11 +771,13 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
779
771
MPI_Aint * memory_displacements = NULL ;
780
772
int * temp_disp_index = NULL ;
781
773
MPI_Aint global_count = 0 ;
774
+ int * blocklength_proc = NULL ;
775
+ ptrdiff_t * displs_proc = NULL ;
782
776
783
777
data -> num_io_entries = 0 ;
784
778
data -> bytes_sent = 0 ;
785
779
data -> io_array = NULL ;
786
- data -> send_buf = NULL ;
780
+
787
781
/**********************************************************************
788
782
*** 7a. Getting ready for next cycle: initializing and freeing buffers
789
783
**********************************************************************/
@@ -1158,74 +1152,86 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
1158
1152
}
1159
1153
} /* end if (entries_per_aggr > 0 ) */
1160
1154
}/* end if (aggregator == rank ) */
1161
-
1162
- if ( data -> sendbuf_is_contiguous ) {
1163
- data -> send_buf = & ((char * )data -> buf )[data -> total_bytes_written ];
1164
- }
1165
- else if (bytes_sent ) {
1166
- /* allocate a send buffer and copy the data that needs
1167
- to be sent into it in case the data is non-contigous
1168
- in memory */
1169
- ptrdiff_t mem_address ;
1170
- size_t remaining = 0 ;
1171
- size_t temp_position = 0 ;
1172
-
1173
- data -> send_buf = malloc (bytes_sent );
1174
- if (NULL == data -> send_buf ) {
1155
+
1156
+ if (bytes_sent ) {
1157
+ size_t remaining = bytes_sent ;
1158
+ int block_index = -1 ;
1159
+ int blocklength_size = INIT_LEN ;
1160
+
1161
+ ptrdiff_t send_mem_address = NULL ;
1162
+ ompi_datatype_t * newType = MPI_DATATYPE_NULL ;
1163
+ blocklength_proc = (int * ) calloc (blocklength_size , sizeof (int ));
1164
+ displs_proc = (ptrdiff_t * ) calloc (blocklength_size , sizeof (ptrdiff_t ));
1165
+
1166
+ if (NULL == blocklength_proc || NULL == displs_proc ) {
1175
1167
opal_output (1 , "OUT OF MEMORY\n" );
1176
1168
ret = OMPI_ERR_OUT_OF_RESOURCE ;
1177
1169
goto exit ;
1178
1170
}
1179
-
1180
- remaining = bytes_sent ;
1181
-
1171
+
1182
1172
while (remaining ) {
1183
- mem_address = (ptrdiff_t )
1184
- (data -> decoded_iov [data -> iov_index ].iov_base ) + data -> current_position ;
1185
-
1173
+ block_index ++ ;
1174
+
1175
+ if (0 == block_index ) {
1176
+ send_mem_address = (ptrdiff_t ) (data -> decoded_iov [data -> iov_index ].iov_base ) +
1177
+ data -> current_position ;
1178
+ }
1179
+ else {
1180
+ // Reallocate more memory if blocklength_size is not enough
1181
+ if (0 == block_index % INIT_LEN ) {
1182
+ blocklength_size += INIT_LEN ;
1183
+ blocklength_proc = (int * ) realloc (blocklength_proc , blocklength_size * sizeof (int ));
1184
+ displs_proc = (ptrdiff_t * ) realloc (displs_proc , blocklength_size * sizeof (ptrdiff_t ));
1185
+ }
1186
+ displs_proc [block_index ] = (ptrdiff_t ) (data -> decoded_iov [data -> iov_index ].iov_base ) +
1187
+ data -> current_position - send_mem_address ;
1188
+ }
1189
+
1186
1190
if (remaining >=
1187
1191
(data -> decoded_iov [data -> iov_index ].iov_len - data -> current_position )) {
1188
- memcpy ( data -> send_buf + temp_position ,
1189
- ( IOVBASE_TYPE * ) mem_address ,
1190
- data -> decoded_iov [ data -> iov_index ]. iov_len - data -> current_position ) ;
1192
+
1193
+ blocklength_proc [ block_index ] = data -> decoded_iov [ data -> iov_index ]. iov_len -
1194
+ data -> current_position ;
1191
1195
remaining = remaining -
1192
- (data -> decoded_iov [data -> iov_index ].iov_len - data -> current_position );
1193
- temp_position = temp_position +
1194
- (data -> decoded_iov [data -> iov_index ].iov_len - data -> current_position );
1196
+ (data -> decoded_iov [data -> iov_index ].iov_len - data -> current_position );
1195
1197
data -> iov_index = data -> iov_index + 1 ;
1196
1198
data -> current_position = 0 ;
1197
1199
}
1198
1200
else {
1199
- memcpy (data -> send_buf + temp_position ,
1200
- (IOVBASE_TYPE * ) mem_address ,
1201
- remaining );
1201
+ blocklength_proc [block_index ] = remaining ;
1202
1202
data -> current_position += remaining ;
1203
1203
remaining = 0 ;
1204
1204
}
1205
1205
}
1206
- }
1207
- data -> total_bytes_written += bytes_sent ;
1208
- data -> bytes_sent = bytes_sent ;
1209
- /* Gather the sendbuf from each process in appropritate locations in
1210
- aggregators*/
1211
-
1212
- if (bytes_sent ){
1213
- ret = MCA_PML_CALL (isend (data -> send_buf ,
1214
- bytes_sent ,
1215
- MPI_BYTE ,
1216
- aggregator ,
1217
- FCOLL_VULCAN_SHUFFLE_TAG + index ,
1218
- MCA_PML_BASE_SEND_STANDARD ,
1219
- data -> comm ,
1220
- & reqs [data -> procs_per_group ]));
1221
-
1222
-
1223
- if ( OMPI_SUCCESS != ret ){
1224
- goto exit ;
1206
+
1207
+ data -> total_bytes_written += bytes_sent ;
1208
+ data -> bytes_sent = bytes_sent ;
1209
+
1210
+ if ( 0 <= block_index ) {
1211
+ ompi_datatype_create_hindexed (block_index + 1 ,
1212
+ blocklength_proc ,
1213
+ displs_proc ,
1214
+ MPI_BYTE ,
1215
+ & newType );
1216
+ ompi_datatype_commit (& newType );
1217
+
1218
+ ret = MCA_PML_CALL (isend ((char * )send_mem_address ,
1219
+ 1 ,
1220
+ newType ,
1221
+ aggregator ,
1222
+ FCOLL_VULCAN_SHUFFLE_TAG + index ,
1223
+ MCA_PML_BASE_SEND_STANDARD ,
1224
+ data -> comm ,
1225
+ & reqs [data -> procs_per_group ]));
1226
+ if ( MPI_DATATYPE_NULL != newType ) {
1227
+ ompi_datatype_destroy (& newType );
1228
+ }
1229
+ if (OMPI_SUCCESS != ret ){
1230
+ goto exit ;
1231
+ }
1225
1232
}
1226
-
1227
1233
}
1228
-
1234
+
1229
1235
#if DEBUG_ON
1230
1236
if (aggregator == rank ){
1231
1237
printf ("************Cycle: %d, Aggregator: %d ***************\n" ,
@@ -1301,7 +1307,9 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i
1301
1307
free (sorted_file_offsets );
1302
1308
free (file_offsets_for_agg );
1303
1309
free (memory_displacements );
1304
-
1310
+ free (blocklength_proc );
1311
+ free (displs_proc );
1312
+
1305
1313
return OMPI_SUCCESS ;
1306
1314
}
1307
1315
0 commit comments