@@ -61,11 +61,133 @@ void micros_overflow_tick(void* arg) {
61
61
micros_at_last_overflow_tick = m ;
62
62
}
63
63
64
- unsigned long ICACHE_RAM_ATTR millis () {
65
- uint32_t m = system_get_time ();
66
- uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick ) ? 1 : 0 );
67
- return c * 4294967 + m / 1000 ;
68
- }
64
+ //---------------------------------------------------------------------------
65
+ // millis() 'magic multiplier' approximation
66
+ //
67
+ // This function corrects the cumulative (296us / usec overflow) drift
68
+ // seen in the original 'millis()' function.
69
+ //
70
+ // Input:
71
+ // 'm' - 32-bit usec counter, 0 <= m <= 0xFFFFFFFF
72
+ // 'c' - 32-bit usec overflow counter 0 <= c < 0x00400000
73
+ // Output:
74
+ // Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
75
+ //
76
+ // Notes:
77
+ //
78
+ // 1) This routine approximates the 64-bit integer division,
79
+ //
80
+ // quotient = ( 2^32 c + m ) / 1000,
81
+ //
82
+ // through the use of 'magic' multipliers. A slow division is replaced by
83
+ // a faster multiply using a scaled multiplicative inverse of the divisor:
84
+ //
85
+ // quotient =~ ( 2^32 c + m ) * k, where k = Ceiling[ 2^n / 1000 ]
86
+ //
87
+ // The precision difference between multiplier and divisor sets the
88
+ // upper-bound of the dividend which can be successfully divided.
89
+ //
90
+ // For this application, n = 64, and the divisor (1000) has 10-bits of
91
+ // precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
92
+ // and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
93
+ // for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
94
+ //
95
+ // 2) A distributed multiply with offset-summing is used to find k( 2^32 c + m ):
96
+ //
97
+ // prd = (2^32 kh + kl) * ( 2^32 c + m )
98
+ // = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
99
+ // (d) (c) (b) (a)
100
+ //
101
+ // Graphically, the offset-sums align in little endian like this:
102
+ // LS -> MS
103
+ // 32 64 96 128
104
+ // | a[-1] | a[0] | a[1] | a[2] |
105
+ // | m kl | 0 | 0 | a[-1] not needed
106
+ // | | m kh | |
107
+ // | | c kl | | a[1] holds the result
108
+ // | | | c kh | a[2] can be discarded
109
+ //
110
+ // As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
111
+ // overall result, only (2) 32-bit words are needed for the accumulator.
112
+ //
113
+ // 3) As C++ does not intrinsically test for addition overflows, one must
114
+ // code specifically to detect them. This approximation skips these
115
+ // overflow checks for speed, hence the sum,
116
+ //
117
+ // highword( m kl ) + m kh + c kl < (2^64-1), MUST NOT OVERFLOW.
118
+ //
119
+ // To meet this criterion, not only do we have to pick 'k' to achieve our
120
+ // desired precision, we also have to split 'k' appropriately to avoid
121
+ // any addition overflows.
122
+ //
123
+ // 'k' should be also chosen to align the various products on byte
124
+ // boundaries to avoid any 64-bit shifts before additions, as they incur
125
+ // major time penalties. The 'k' chosen for this specific division by 1000
126
+ // was picked primarily to avoid shifts as well as for precision.
127
+ //
128
+ // For the reasons listed above, this routine is NOT a general one.
129
+ // Changing divisors could break the overflow requirement and force
130
+ // picking a 'k' split which requires shifts before additions.
131
+ //
132
+ // ** Test THOROUGHLY after making changes **
133
+ //
134
+ // 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
135
+ //
136
+ // usec x Orig Comment
137
+ // Orig: 3.18 1.00 Original code
138
+ // Corr: 13.21 4.15 64-bit reference code
139
+ // Test: 4.60 1.45 64-bit magic multiply, 4x32
140
+ //
141
+ // The magic multiplier routine runs ~3x faster than the reference. Execution
142
+ // times can vary considerably with the numbers being multiplied, so one
143
+ // should derate this factor to around 2x, worst case.
144
+ //
145
+ // Reference function: corrected millis(), 64-bit arithmetic,
146
+ // truncated to 32-bits by return
147
+ // unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
148
+ // {
149
+ // // Get usec system time, usec overflow counter
150
+ // ......
151
+ // return ( (c * 4294967296 + m) / 1000 ); // 64-bit division is SLOW
152
+ // } //millis_corr
153
+ //
154
+ // 5) See this link for a good discussion on magic multipliers:
155
+ // http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
156
+ //
157
+
158
+ #define MAGIC_1E3_wLO 0x4bc6a7f0 // LS part
159
+ #define MAGIC_1E3_wHI 0x00418937 // MS part, magic multiplier
160
+
161
+ unsigned long ICACHE_RAM_ATTR millis ()
162
+ {
163
+ union {
164
+ uint64_t q ; // Accumulator, 64-bit, little endian
165
+ uint32_t a [2 ]; // ..........., 32-bit segments
166
+ } acc ;
167
+ acc .a [1 ] = 0 ; // Zero high-acc
168
+
169
+ // Get usec system time, usec overflow counter
170
+ uint32_t m = system_get_time ();
171
+ uint32_t c = micros_overflow_count +
172
+ ((m < micros_at_last_overflow_tick ) ? 1 : 0 );
173
+
174
+ // (a) Init. low-acc with high-word of 1st product. The right-shift
175
+ // falls on a byte boundary, hence is relatively quick.
176
+
177
+ acc .q = ( (uint64_t )( m * (uint64_t )MAGIC_1E3_wLO ) >> 32 );
178
+
179
+ // (b) Offset sum, low-acc
180
+ acc .q += ( m * (uint64_t )MAGIC_1E3_wHI );
181
+
182
+ // (c) Offset sum, low-acc
183
+ acc .q += ( c * (uint64_t )MAGIC_1E3_wLO );
184
+
185
+ // (d) Truncated sum, high-acc
186
+ acc .a [1 ] += (uint32_t )( c * (uint64_t )MAGIC_1E3_wHI );
187
+
188
+ return ( acc .a [1 ] ); // Extract result, high-acc
189
+
190
+ } //millis
69
191
70
192
unsigned long ICACHE_RAM_ATTR micros () {
71
193
return system_get_time ();
0 commit comments