Better sin/cos LUT implementations

vloncar · vloncar · commit 8a94f4a95d3e · 2023-02-21T16:59:38.000+01:00
diff --git a/hls4ml/backends/symbolic/passes/expr_templates.py b/hls4ml/backends/symbolic/passes/expr_templates.py
@@ -13,11 +13,19 @@
 
 expr_include_list = ['hls_math.h', 'nnet_utils/nnet_math.h']
 
+built_in_luts = ['sin_lut', 'cos_lut']
+
 class HLSCodePrinter(CXX11CodePrinter):
     _ns = 'hls::'
 
-    def __init__(self, layer, lut_functions, settings=None):
+    def __init__(self, layer, lut_functions, use_built_in_luts=False, settings=None):
         if lut_functions is not None:
+            if use_built_in_luts:
+                # Check if user's LUTs override built-in LUTs
+                for lut_name in lut_functions.keys():
+                    if lut_name in built_in_luts:
+                        print(f'WARNING: User-specified LUT function {lut_name} overrides built-in LUT function.')
+
             if settings is None:
                 settings = { 'user_functions': lut_functions }
             else:
@@ -27,6 +35,7 @@ def __init__(self, layer, lut_functions, settings=None):
 
         super().__init__(settings)
         self.layer = layer
+        self.use_built_in_luts = use_built_in_luts
 
         for k in ('Abs Sqrt exp exp2 expm1 log log10 log2 log1p Cbrt hypot fma'
           ' loggamma sin cos tan asin acos atan atan2 sinh cosh tanh asinh acosh '
@@ -82,7 +91,14 @@ def _print_math(self, expr):
         cast = f'({hls_type.name})'
         args = ', '.join(map(lambda arg: self._print(arg), expr.args))
 
-        return f'{self._ns}{name}{template}({cast}({args}))'
+        if self.use_built_in_luts and name + '_lut' in built_in_luts:
+            ns = 'nnet::'
+            name = name + '_lut'
+            template = f'<{hls_type.name}>'
+        else:
+            ns = self._ns
+
+        return f'{ns}{name}{template}({cast}({args}))'
 
     def _print_Symbol(self, expr):
         name = super()._print_Symbol(expr)
@@ -96,7 +112,8 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
 
-        printer = HLSCodePrinter(node, lut_functions={ lut_fun.name : lut_fun.name for lut_fun in params['lut_functions'] })
+        lut_functions = { lut_fun.name : lut_fun.name for lut_fun in params['lut_functions'] }
+        printer = HLSCodePrinter(node, lut_functions=lut_functions, use_built_in_luts=node.attributes['use_built_in_luts'])
 
         fn_templates = []
         for i, expr in enumerate(node.attributes['expression']):
diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
@@ -400,6 +400,7 @@ def convert_from_symbolic_expression(
     expr,
     n_symbols=None,
     lut_functions=None,
+    use_built_in_lut_functions=False,
     output_dir='my-hls-test',
     project_name='myproject',
     input_data_tb=None,
@@ -440,6 +441,7 @@ def convert_from_symbolic_expression(
     expr_layer['expression'] = [str(e) for e in expr]
     expr_layer['n_symbols'] = n_symbols
     expr_layer['lut_functions'] = lut_functions
+    expr_layer['use_built_in_luts'] = use_built_in_lut_functions
     layer_list.append(expr_layer)
 
     config = create_config(
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_math.h b/hls4ml/templates/vivado/nnet_utils/nnet_math.h
@@ -45,10 +45,155 @@ T atan(T x) {
 };
 
 template<typename T>
-T atan2(T x) {
-    return (T) hls::atan2(x);
+T atan2(T x, T y) {  // Two-argument wrapper: forwards (x, y) in that order to hls::atan2 and casts the result to T.
+    return (T) hls::atan2(x, y);  // NOTE(review): std::atan2 takes (y, x); presumably hls::atan2 does too - confirm the parameter naming here.
 };
 
+template<class T, int W, int I>  // Fills a (cos, sin) lookup table of 1<<(W - I - 3) rows covering the first octant.
+void init_sincos_table(T table[1<<(W - I - 3)][2]) {  // table[i][0] = cos(i*step), table[i][1] = sin(i*step)
+    unsigned int NTE = 1<<(W - I - 3); // Number of table entries
+    double step = M_PI/(4*NTE);        // Angle increment between entries: pi/4 split into NTE equal steps
+    double y = 0;                      // Current angle in radians, starting at 0
+    //double scaled_angle = 0;
+    // Sample cos/sin at NTE evenly spaced angles; the last sampled angle is pi/4 - step, so pi/4 itself is not stored.
+    for (unsigned int i=0; i < NTE; i++) {
+        table[i][0] = std::cos(y);
+        table[i][1] = std::sin(y);
+        y += step;
+        //scaled_angle = y/(2*M_PI);
+        //printf("cos(%f) = %23.22f, sin(%f) = %23.22f index = %d, scaled angle = %13.12f \n", y, cos(y), y, sin(y), i, scaled_angle);
+    }
+
+}
+
+template<class T>
+void sincos_lut(const T &input, T output[2]) {
+    // Table-based sin/cos. `input` is the angle as a fraction of a full turn (angle / (2*pi)); output[0] = cos, output[1] = sin.
+    #pragma HLS INLINE
+    // One-time check: the table holds 512 entries per octant, so inputs with more than 12 fractional bits are only approximated.
+    static bool flag = true;
+    if (flag && T::width-T::iwidth > 12) {
+        #if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG)
+        std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl;
+        std::cout << "Warning: The output of sincos_lut will not be accurate" << std::endl;
+        #endif
+        flag = false;
+    }
+    // Datatype for lookup table entries (unsigned, same width/integer width as T, with rounding)
+    typedef ap_ufixed <T::width, T::iwidth, AP_RND> luttype;
+    // Datatype for posinput: unsigned and fraction-only (as many bits as T has fractional bits); used to handle negative inputs
+    typedef ap_ufixed<T::width-T::iwidth, 0> posinputtype;
+
+    typedef ap_uint<9> lutindextype; // 9 bits required for indexing into the 512-entry table
+    typedef ap_uint<3> octanttype;   // 3 bits required for the octant value range of 0 through 7
+    T outputtemp[2];
+    lutindextype luTdex = 0;
+    posinputtype posinput = input;
+
+    // Initialize the lookup table: a plain local that is refilled on every call under synthesis, a static filled once otherwise
+#ifdef __SYNTHESIS__
+    bool initialized = false;
+    luttype sincos[512][2];
+#else
+    static bool initialized = false;
+    static luttype sincos[512][2];
+#endif
+    if (!initialized) {
+        init_sincos_table<luttype, 12, 0>(sincos);
+        initialized = true;
+    }
+
+    // Leaving this commented out makes the table go to BRAM
+    //#pragma HLS ARRAY_PARTITION variable=sincos complete dim=0
+
+    typedef ap_uint<AP_MAX(T::width-T::iwidth-3, 1)> lutindextype1;
+    // Extracting (MSB-3:LSB) bits of the scaled input, i.e. everything below the 3 octant bits, to determine the lookup table index
+    lutindextype1 luTdex1 = posinput.range(AP_MAX(T::width-T::iwidth-3, 1), 0); // NOTE(review): range(hi, lo) is presumably inclusive, selecting one bit more than lutindextype1 holds - confirm the extra MSB is meant to be dropped
+
+    if (T::width-T::iwidth>=4 && T::width-T::iwidth<=12) {
+        luTdex(8, 12- (T::width - T::iwidth)) = luTdex1; // left-align in the 9-bit index so narrower inputs stride through the table
+    }
+    // Approximation for scaled inputs that have more than 12 fractional bits
+    else if (T::width-T::iwidth>12) {
+        // Lookup table index for such inputs: drop the surplus low bits (divide by 2^(fractional bits - 12))
+        luTdex = luTdex1/(1<<(AP_MAX(T::width-T::iwidth-12, 0)));
+        if ((luTdex1 % (1<<(AP_MAX(T::width-T::iwidth-12,0)))) > (1<<(AP_MAX(T::width-T::iwidth-13,0)))) { // remainder above half a step
+            luTdex = luTdex + 1; // round up to the next table entry
+        }
+        typedef ap_ufixed<AP_MAX((AP_MAX(T::width-T::iwidth-3, 1) + T::width-T::iwidth-12), 1), AP_MAX(T::width-T::iwidth-3, 1)> datatype;
+        datatype x = (datatype)luTdex1;
+        x = x >> AP_MAX(T::width-T::iwidth-12, 0); // same scaling as above, but keeping the fractional part
+        if (x > 511.5) { luTdex = 511; } // clamp so that rounding up cannot run past the last table entry
+        if (luTdex1 <= 1<<(AP_MAX(T::width-T::iwidth-13,0)) && luTdex1 != 0) { luTdex = 1; } // nonzero inputs of at most half a step use entry 1, not 0
+    }
+
+    if (T::width-T::iwidth>=3) {
+        // Getting the octant 0-7 by extracting the first 3 bits from MSB side of scaled input where
+        //   octant 0 corresponds to [0-PI/4),
+        //   octant 1 corresponds to [PI/4-2PI/4),
+        //   octant 2 corresponds to [2PI/4-3PI/4) and so on
+        //octanttype octant = posinput.template slc<3>(T::width-T::iwidth-3);
+        octanttype octant = posinput(T::width-T::iwidth-1, T::width-T::iwidth-3);
+        luTdex = (octant[0] == 1)?(lutindextype)(512-luTdex):(lutindextype)(luTdex); // odd octants read the table mirrored
+        // output[1] is the sine (imaginary part): pick the sin or cos column and the sign according to the octant
+        outputtemp[1] = ((octant==0) | (octant==3)) ? (T) sincos[luTdex][1]:
+                        ((octant==2) | (octant==1)) ? (T) sincos[luTdex][0]:
+                        ((octant==7) | (octant==4)) ? (T)-sincos[luTdex][1]:
+                        (T)-sincos[luTdex][0];
+        // output[0] is the cosine (real part): pick the sin or cos column and the sign according to the octant
+        outputtemp[0] = ((octant==6) | (octant==1)) ? (T) sincos[luTdex][1]:
+                        ((octant==3) | (octant==4)) ? (T)-sincos[luTdex][0]:
+                        ((octant==2) | (octant==5)) ? (T)-sincos[luTdex][1]:
+                        (T) sincos[luTdex][0];
+        // Below are the odd multiples of pi/4 (scaled input 0.125, 0.375, 0.625, 0.875), where sin and cos are +/- 1/sqrt(2) and there is no entry in the lookup table
+        output[1] = ((posinput==0.125) | (posinput==0.375)) ? T( 0.7071067811865475244008):
+                    ((posinput==0.625) | (posinput==0.875)) ? T(-0.7071067811865475244008):
+                    outputtemp[1];
+        output[0] = ((posinput==0.125) | (posinput==0.875)) ? T( 0.7071067811865475244008):
+                    ((posinput==0.375) | (posinput==0.625)) ? T(-0.7071067811865475244008):
+                    outputtemp[0];
+    }
+    // With at most 2 fractional bits only multiples of a quarter turn can be represented, so the results are written out directly.
+    if (T::width-T::iwidth <= 2) {
+        output[1] = (posinput==0   ) ? (T) 0:
+                    (posinput==0.25) ? (T) 1:
+                    (posinput==0.5 ) ? (T) 0:
+                    (posinput==0.75) ? (T)-1:
+                    outputtemp[1]; // NOTE(review): outputtemp is never written on this path; the fallback looks unreachable - confirm
+        output[0] = (posinput==0   ) ? (T) 1:
+                    (posinput==0.25) ? (T) 0:
+                    (posinput==0.5 ) ? (T)-1:
+                    (posinput==0.75) ? (T) 0:
+                    outputtemp[0]; // NOTE(review): same uninitialized fallback as above - confirm
+    }
+    // Optional debug dump of the intermediate values (simulation only, requires SINCOS_LUT_DEBUG)
+    #if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG)
+    std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl;
+    std::cout << "============AP_FIXED SINCOS======================" << std::endl;
+    std::cout << "positive input is   = " << posinput << std::endl;
+    std::cout << "lut index is   = " << luTdex << std::endl;
+    std::cout << "sin value is    = " << output[1] << std::endl;
+    std::cout << "cos value is    = " << output[0] << std::endl;
+    std::cout << "=================================================" << std::endl;
+    #endif
+}
+
+template<class T>  // Lookup-table sine: returns sin(input) for `input` in radians, computed by sincos_lut.
+T sin_lut(const T input) {
+    T sincos_res[2];  // filled by sincos_lut: [0] = cos, [1] = sin
+    T scaled_input = input * ap_ufixed<16,0>(0.15915494309); // 1/(2*pi), held in 16 fractional bits: converts radians to a fraction of a full turn
+    sincos_lut(scaled_input, sincos_res);
+    return sincos_res[1];  // the sine component
+}
+
+template<class T>  // Lookup-table cosine: returns cos(input) for `input` in radians, computed by sincos_lut.
+T cos_lut(const T input) {
+    T sincos_res[2];  // filled by sincos_lut: [0] = cos, [1] = sin
+    T scaled_input = input * ap_ufixed<16,0>(0.15915494309); // 1/(2*pi), held in 16 fractional bits: converts radians to a fraction of a full turn
+    sincos_lut(scaled_input, sincos_res);
+    return sincos_res[0];  // the cosine component
+}
+
 }
 
-#endif
+#endif