diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index b5225017a59..03bd8761d5d 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -31,6 +31,8 @@ #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" +#define vector_length 512/8 + #define DO_DEBUG(INST) if( opal_ddt_pack_debug ) { INST } #else #define DO_DEBUG(INST) @@ -292,8 +294,16 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* we have a basic datatype (working on full blocks) */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + if (pElem->elem.blocklen < vector_length/2) + { + PACK_PREDEFINED_DATATYPE_AVX( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + } + else + { + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + } if( 0 != count_desc ) /* completed? */ goto complete_loop; conv_ptr = pConvertor->pBaseBuf + pStack->disp; diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 2031a005e70..7109086c906 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -20,6 +20,10 @@ #define OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED #include "opal_config.h" +#include + +#define vector_length 512/8 + #include "opal/datatype/opal_datatype_pack_unpack_predefined.h" #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT @@ -166,6 +170,211 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, *(packed) = _packed; } +/* pack with avx gather */ +static inline void +pack_predefined_data_avx( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; + + assert( *(COUNT) <= total_count); //_elem->count * _elem->blocklen); + + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + DO_DEBUG( opal_output( 0, "Begin pack do_now:--%d total_count:--%d *(COUNT):--%d ", do_now, total_count, *(COUNT));); + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ + + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + cando_count -= do_now; + } + } + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + + /* each block can full fill vector, we use 4x version, or just copy once */ + if(vector_length < do_now_bytes || do_now == 1) { + DO_DEBUG( opal_output( 0, "block larger than VL or single copy(%d)(cando %d)(len %d)(extend %d)(do_now %d)", + do_now,cando_count,_elem->blocklen, _elem->extent,do_now_bytes); ); + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + *(packed) += do_now_bytes; + _memory += _elem->extent; + *(SPACE) -= do_now_bytes; + *(COUNT) -= _elem->blocklen; + cando_count -= _elem->blocklen; + } + } + /* each vector can deal with multi blocks */ + else { + DO_DEBUG( opal_output( 0, "block in bytes is smaller than VL in bytes, use gather_load version"); ); + /* how many full blocks can be processed in each vector */ + int blocks_in_VL = vector_length/do_now_bytes; + + /* cannot fullfill a whole vector to copy + * + * |__-|__-| + * + * |__|__|__|__| + * + */ + if(blocks_in_VL>do_now) { + blocks_in_VL = do_now; + } + + DO_DEBUG( opal_output( 0, "blength %d extend %d block bytes %d blocks %d do_now %d", + _elem->blocklen, _elem->extent, do_now_bytes, blocks_in_VL, do_now); ); + /* max VL 512/8/4 = 16 offsets for MPI_INT */ + uint32_t off_sets[16]; + int start = 0; + + /* get offsets for block items + * + * blocks "__":useful data; "-":no-copy + * |__-|__-|__-|__-| + * | / / + * | / / + * |__|__|__|__| + * + * get offsets for block items + */ + for(int j=0; jcommon.type]->size; i++) + { + /* offset in bytes offset=offset in block + extend(bytes)*j */ + off_sets[start] = (i+j*_elem->extent/opal_datatype_basicDatatypes[_elem->common.type]->size); + DO_DEBUG( opal_output( 0, "off_sets --%d",off_sets[start]);); + start++; + } + } + /* cannot totally fullfill the vector but almost full, best we can do */ + __mmask16 load_mask; + __m512i temp_src; + __m512i xt = _mm512_load_epi32((__m512*)off_sets);// _mm512_loadu_si512((__m512*)off_sets); +/* + uint32_t offs[256]; + for(int i=0; i<256; i=i+1) + { + offs[i]=i+1; + // DO_DEBUG( opal_output( 0, "--%d",offs[i]);); + } +*/ + /* loop thru how many vector copy need to do + * + * blocks "__":useful data; "-":no-copy + * |__-|__-|__-|__-|__-|__-|__-|__-|__-|__-| + * |-------VL------|-------VL------|--rem--| + * + */ + + int num_of_copys = cando_count/ (_elem->blocklen*blocks_in_VL); + load_mask = _cvtu32_mask16((1<<_elem->blocklen*blocks_in_VL)-1); + for(int i=0; i < num_of_copys; i++) + { + DO_DEBUG( opal_output( 0, "pack full VL. memcpy( %p, %p, %lu ) => space %lu copy seq %d copy(%d)(cando %d)(len %d)(extend %d)(do_now%d)" + ,(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes*blocks_in_VL, (unsigned long)*(SPACE), + i, do_now,cando_count,_elem->blocklen, _elem->extent, do_now); ); + //load_mask = _cvtu32_mask16((1<<_elem->blocklen*blocks_in_VL)-1); + //__m512i vsrc = _mm512_mask_i32gather_epi32 (temp_src, load_mask, xt, (void*)offs, 4); //_memory + __m512i vsrc = _mm512_mask_i32gather_epi32 (temp_src, load_mask, xt, (void*)_memory, 4); //_memory + //_mm512_mask_store_epi32( *(packed), load_mask, vsrc); + _mm512_store_epi32 (*(packed), vsrc); + /* + for(int i=0; i<_elem->blocklen*blocks_in_VL; i++) + { + DO_DEBUG( opal_output( 0, "No.%d -- %p %d %p %d ", i, *packed+i*4,*(*packed+i*4),_memory+i ,*(_memory+i));); + } + */ + *(packed) += do_now_bytes*blocks_in_VL; + _memory += _elem->extent*blocks_in_VL; + *(SPACE) -= do_now_bytes*blocks_in_VL; + *(COUNT) -= _elem->blocklen*blocks_in_VL; + cando_count -= _elem->blocklen*blocks_in_VL; + } + /* remaining blocks */ + blocks_in_VL = cando_count / _elem->blocklen; + DO_DEBUG( opal_output( 0, "Total ramining cando_count -- %d", cando_count);); + if (blocks_in_VL != 0) { + // Need to use mask OP for partial load & store + __mmask16 load_mask = _cvtu32_mask16((1<blocklen*blocks_in_VL; i++) + { + DO_DEBUG( opal_output( 0, "ramining No.%d -- %p %d %p %d ", i, *packed+i*4,*(*packed+i*4),_memory+i ,*(_memory+i));); + } + + + *(packed) += do_now_bytes*blocks_in_VL; + _memory += _elem->extent*blocks_in_VL; + *(SPACE) -= do_now_bytes*blocks_in_VL; + *(COUNT) -= _elem->blocklen*blocks_in_VL; + cando_count -= _elem->blocklen*blocks_in_VL; + } + } + } + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + } + + *(memory) = _memory - _elem->disp; +} + + static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, @@ -210,6 +419,14 @@ pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SP SPACE ) /* the space in the destination buffer */ \ pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) +#define PACK_PREDEFINED_DATATYPE_AVX( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +pack_predefined_data_avx( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) + #define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, MEMORY, PACKED, SPACE ) \ pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 6f9fdce2774..5b6dcc2b099 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -42,6 +42,8 @@ #include "opal/datatype/opal_datatype_unpack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#define vector_length 512/8 + #if defined(CHECKSUM) #define opal_unpack_general_function opal_unpack_general_checksum #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum @@ -201,8 +203,16 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle #endif /* Then unpack the data into the user memory */ - UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - temporary_buffer, *user_buffer, data_length ); + if (pElem->elem.blocklen < vector_length/2) + { + UNPACK_PREDEFINED_DATATYPE_AVX( pConvertor, pElem, count_desc, + temporary_buffer, *user_buffer, data_length ); + } + else + { + UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + temporary_buffer, *user_buffer, data_length ); + } /* reload the length as it is reset by the macro */ data_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index a786a2fc7e9..bad443ddf51 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2019 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -19,6 +19,10 @@ #define OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED #include "opal_config.h" + +#include +#define vector_length 512/8 + #include "opal/datatype/opal_datatype_pack_unpack_predefined.h" #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT @@ -162,6 +166,166 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, *(packed) = _packed; } +/* avx sactter unpack */ +static inline void +unpack_predefined_data_avx( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t total_count = _elem->count * _elem->blocklen; + size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now, do_now_bytes; + unsigned char* _memory = (*memory) + _elem->disp; + + assert( *(COUNT) <= total_count);// _elem->count * _elem->blocklen); + + if( cando_count > *(COUNT) ) + cando_count = *(COUNT); + + /** + * First check if we already did something on this element ? + */ + do_now = (total_count - *(COUNT)); /* done elements */ + if( 0 != do_now ) { + do_now = do_now % _elem->blocklen; /* partial blocklen? */ + + if( 0 != do_now ) { + size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ + do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + cando_count -= do_now; + } + } + + /** + * Compute how many full blocklen we need to do and do them. + */ + do_now = cando_count / _elem->blocklen; + if( 0 != do_now ) { + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + /* each block can full fill vector, we use 4x version, or just copy once */ + if(vector_length < do_now_bytes || do_now == 1) { + DO_DEBUG( opal_output( 5, "unpack block in bytes is larger than VL in bytes, use 4x version"); ); + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + *(packed) += do_now_bytes; + _memory += _elem->extent; + *(SPACE) -= do_now_bytes; + *(COUNT) -= _elem->blocklen; + cando_count -= _elem->blocklen; + } + } + /* each vector can deal with multi blocks */ + else { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + + DO_DEBUG( opal_output( 0, "unpack: block in bytes is smaller than VL in bytes, use scatter_store version"); ); + /* how many full blocks can be processed in each vector */ + int blocks_in_VL = vector_length/do_now_bytes/4; + + if(blocks_in_VL>do_now) { + blocks_in_VL = do_now; + } + DO_DEBUG( opal_output( 0, "unpack blength %d extend %d block bytes %d blocks %d do_now %d", + _elem->blocklen, _elem->extent, do_now_bytes, blocks_in_VL, do_now); ); + + uint32_t off_sets[256]; + int start = 0; + for(int j=0; jextent/opal_datatype_basicDatatypes[_elem->common.type]->size); + start++; + } + } + __mmask16 load_mask; + __m512i temp_src; + __m512i xt = _mm512_loadu_si512((__m512*)off_sets); + + int num_of_copys = cando_count/ (_elem->blocklen*blocks_in_VL); + load_mask = _cvtu32_mask16((1<<_elem->blocklen*blocks_in_VL)-1); + + for(int i=0; i < num_of_copys; i++) + { + __m512i vsrc = _mm512_mask_load_epi32 (temp_src, load_mask, *(packed)); + _mm512_mask_i32scatter_epi32 (_memory, load_mask, xt, vsrc, 4); + /* + for(int i=0; i<_elem->extent*blocks_in_VL; i=i+4) + { + DO_DEBUG( opal_output( 0, "un-- %p %d %p %d ",*packed+i,*(*packed+i),_memory+i ,*(_memory+i));); + } + */ + *(packed) += do_now_bytes*blocks_in_VL; + _memory += _elem->extent*blocks_in_VL; + *(SPACE) -= do_now_bytes*blocks_in_VL; + *(COUNT) -= _elem->blocklen*blocks_in_VL; + cando_count -= _elem->blocklen*blocks_in_VL; + } + + /* remaining blocks */ + blocks_in_VL = cando_count / _elem->blocklen; + if (blocks_in_VL != 0) { + __mmask16 load_mask = _cvtu32_mask16((1<extent*blocks_in_VL; i=i+4) + { + DO_DEBUG( opal_output( 0, "remain un-- %p %d %p %d ",*packed+i,*(*packed+i),_memory+i ,*(_memory+i));); + } + */ + *(packed) += do_now_bytes*blocks_in_VL; + _memory += _elem->extent*blocks_in_VL; + *(SPACE) -= do_now_bytes*blocks_in_VL; + *(COUNT) -= _elem->blocklen*blocks_in_VL; + cando_count -= _elem->blocklen*blocks_in_VL; + } + } + } + + /** + * As an epilog do anything left from the last blocklen. + */ + do_now = cando_count; + if( 0 != do_now ) { + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + _memory += do_now_bytes; + *(packed) += do_now_bytes; + *(SPACE) -= do_now_bytes; + *(COUNT) -= do_now; + } + + *(memory) = _memory - _elem->disp; +} + static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, size_t* COUNT, @@ -206,6 +370,14 @@ unpack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &( SPACE ) /* the space in the destination buffer */ \ unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) +#define UNPACK_PREDEFINED_DATATYPE_AVX( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_predefined_data_avx( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) + #define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) )