Skip to content

Commit 082dff3

Browse files
IMF LA open-sourcing. FP64 cospi. (#15)
Co-authored-by: Jacek Jankowski <[email protected]>
1 parent a900121 commit 082dff3

File tree

2 files changed

+295
-4
lines changed

2 files changed

+295
-4
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
/*===================== begin_copyright_notice ==================================
2+
3+
Copyright (c) 2017 Intel Corporation
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a
6+
copy of this software and associated documentation files (the
7+
"Software"), to deal in the Software without restriction, including
8+
without limitation the rights to use, copy, modify, merge, publish,
9+
distribute, sublicense, and/or sell copies of the Software, and to
10+
permit persons to whom the Software is furnished to do so, subject to
11+
the following conditions:
12+
13+
The above copyright notice and this permission notice shall be included
14+
in all copies or substantial portions of the Software.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17+
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23+
24+
25+
======================= end_copyright_notice ==================================*/
26+
27+
#include "../imf.h"
28+
#pragma OPENCL FP_CONTRACT OFF
29+
/*
 * Layout of a constant table made of twelve 64-bit words.  Every field is
 * stored as an unsigned long bit pattern rather than as a double.
 *
 * NOTE(review): judging by the field names, the first three words are bit
 * masks / a one-half constant and _dC1_h.._dC9 are polynomial
 * coefficients; no code visible in this diff reads any of these fields,
 * so their intended use cannot be confirmed from here.
 */
typedef struct
30+
{
31+
unsigned long _dAbsMask;
32+
unsigned long _dHalf;
33+
unsigned long _dSignBit;
34+
unsigned long _dC1_h;
35+
unsigned long _dC2;
36+
unsigned long _dC3;
37+
unsigned long _dC4;
38+
unsigned long _dC5;
39+
unsigned long _dC6;
40+
unsigned long _dC7;
41+
unsigned long _dC8;
42+
unsigned long _dC9;
43+
} __internal_dcospi_la_data_avx512_t;
44+
/*
 * Constant instance of __internal_dcospi_la_data_avx512_t.  The
 * initializers are positional, so each value below is labelled with the
 * field it lands in.  Values are IEEE-754 binary64 bit patterns.
 *
 * The nine coefficient words (_dC1_h.._dC9) are the sign-flipped
 * counterparts of _dC0.._dC8 in __internal_dcospi_la_data.
 *
 * NOTE(review): this table is not referenced by any code visible in this
 * diff - confirm whether it is still needed.
 */
static __constant __internal_dcospi_la_data_avx512_t __internal_dcospi_la_data_avx512 = {
45+
0x7FFFFFFFFFFFFFFFuL, /* _dAbsMask: every bit set except bit 63 */
46+
0x3fe0000000000000uL, /* _dHalf: 0.5 */
47+
0x8000000000000000uL, /* _dSignBit: only bit 63 set */
48+
0xc00921fb54442d18uL, /* _dC1_h: -pi rounded to binary64 */
49+
0x4014abbce625be52uL, /* _dC2 */
50+
0xc00466bc6775aa7duL, /* _dC3 */
51+
0x3fe32d2cce627c9euL, /* _dC4 */
52+
0xbfb50783485523f4uL, /* _dC5 */
53+
0x3f7e3074dfb5bb14uL, /* _dC6 */
54+
0xbf3e8f3677c334d3uL, /* _dC7 */
55+
0x3ef6f7ad23b5cd51uL, /* _dC8 */
56+
0xbea9d46b06ce620euL, /* _dC9 */
57+
};
58+
59+
/*
 * Layout of the constant table read by __ocl_svml_cospi.  All sixteen
 * fields are 64-bit words holding bit patterns that the reader converts
 * with as_double().
 *
 * Fields read by __ocl_svml_cospi: _dAbsMask, _dReductionRangeVal,
 * _dRangeVal, _dRShifter, _dOneHalf and the coefficients _dC0.._dC8.
 * _dThres and _dSgnExp are declared here but are neither initialized in
 * __internal_dcospi_la_data nor read by the code visible in this diff.
 */
typedef struct
60+
{
61+
unsigned long _dAbsMask;
62+
unsigned long _dReductionRangeVal;
63+
unsigned long _dRangeVal;
64+
unsigned long _dRShifter;
65+
unsigned long _dOneHalf;
66+
67+
unsigned long _dC0;
68+
69+
unsigned long _dC1;
70+
unsigned long _dC2;
71+
unsigned long _dC3;
72+
unsigned long _dC4;
73+
74+
unsigned long _dC5;
75+
unsigned long _dC6;
76+
unsigned long _dC7;
77+
unsigned long _dC8;
78+
unsigned long _dThres;
79+
unsigned long _dSgnExp;
80+
81+
} __internal_dcospi_la_data_t;
82+
/*
 * Constant table consumed by __ocl_svml_cospi.  The initializers are
 * positional, so each value is labelled with the field it lands in.
 * Values are IEEE-754 binary64 bit patterns.
 *
 * Only fourteen values are supplied for a sixteen-field struct: the two
 * trailing fields (_dThres, _dSgnExp) receive no explicit initializer.
 *
 * NOTE(review): _dC0.._dC8 are used as the coefficients of an odd
 * polynomial in the reduced argument; with _dC0 == pi this looks like an
 * approximation of sin(pi * r) - confirm against the generator of these
 * constants.
 */
static __constant __internal_dcospi_la_data_t __internal_dcospi_la_data = {
83+
0x7FFFFFFFFFFFFFFFuL, /* _dAbsMask: every bit set except bit 63 */
84+
0x42A0000000000000uL, /* _dReductionRangeVal: 2^43 */
85+
0x7FF0000000000000uL, /* _dRangeVal: all eleven exponent bits set (bit pattern of +Inf) */
86+
0x4338000000000000uL, /* _dRShifter: 1.5 * 2^52 */
87+
0x3FE0000000000000uL, /* _dOneHalf: 0.5 */
88+
89+
0x400921fb54442d18uL, /* _dC0: pi rounded to binary64 */
90+
0xc014abbce625be52uL, /* _dC1 */
91+
0x400466bc6775aa7duL, /* _dC2 */
92+
0xbfe32d2cce627c9euL, /* _dC3 */
93+
0x3fb50783485523f4uL, /* _dC4 */
94+
0xbf7e3074dfb5bb14uL, /* _dC5 */
95+
0x3f3e8f3677c334d3uL, /* _dC6 */
96+
0xbef6f7ad23b5cd51uL, /* _dC7 */
97+
0x3ea9d46b06ce620euL, /* _dC8 */
98+
99+
};
100+
101+
/*
 * Small lookup table used by __internal_dcospi_la_cout, which reads it
 * through a (__constant double *) cast as entries [0] and [1].
 *
 * The array is declared with three elements, but only four 32-bit words
 * are supplied below; entry [2] is not read by the code visible in this
 * diff.
 *
 * NOTE(review): _iml_v2_dp_union_t comes from "../imf.h", which is not
 * shown here.  Assuming each element is one 64-bit double whose low
 * 32-bit word is listed first, entry [0] is +0.0 and entry [1] is +Inf -
 * confirm against the union definition.
 */
static __constant _iml_v2_dp_union_t __dcospi_la_CoutTab[3] = {
102+
0x00000000, 0x00000000,
103+
0x00000000, 0x7FF00000,
104+
};
105+
106+
#pragma float_control(push)
107+
#pragma float_control(precise,on)
108+
/*
 * Special-input fix-up used by __ocl_svml_cospi (called there only when
 * its vm flag is non-zero).  Defined between the float_control(push) /
 * float_control(precise,on) pragmas and the matching pop.
 *
 * a  - pointer to the original argument (read only through *a).
 * r  - pointer to the result slot; written only when the exponent field
 *      of *a is 0x7FF, otherwise left untouched.
 *
 * Returns 1 when the sign-cleared bit pattern of *a equals
 * __dcospi_la_CoutTab[1], and 0 in every other case.  The caller visible
 * in this diff ignores the return value.
 *
 * NOTE(review): assuming CoutTab[0] is 0.0 and CoutTab[1] is +Inf, the
 * first branch handles +/-Inf (Inf * 0) and the second handles NaN
 * (NaN + NaN) - confirm against the _iml_v2_dp_union_t layout.
 */
__attribute__((always_inline))
109+
inline int __internal_dcospi_la_cout (double *a, double *r)
110+
{
111+
double absx;
112+
int nRet = 0;
113+
114+
absx = (*a);
115+
/* Clear bit 31 of the high 32-bit word of absx (the OR-ed term is 0 << 31, i.e. zero). */
(((_iml_v2_dp_union_t *) & absx)->dwords.hi_dword = (((_iml_v2_dp_union_t *) & absx)->dwords.hi_dword & 0x7FFFFFFF) | ((_iml_uint32_t) (0) << 31));
116+
/* Proceed only when bits 20..30 of the high word of *a are all ones (11-bit field == 0x7FF). */
if (!(((((_iml_v2_dp_union_t *) & (*a))->dwords.hi_dword >> 20) & 0x7FF) != 0x7FF))
117+
{
118+
119+
/* Compare both hex[] words of absx against table entry [1] (table viewed as an array of doubles). */
if ((((_iml_v2_dp_union_t *) & (absx))->hex[0] == ((__constant _iml_v2_dp_union_t *) & (((__constant double *) __dcospi_la_CoutTab)[1]))->hex[0])
120+
&& (((_iml_v2_dp_union_t *) & (absx))->hex[1] ==
121+
((__constant _iml_v2_dp_union_t *) & (((__constant double *) __dcospi_la_CoutTab)[1]))->hex[1]))
122+
{
123+
124+
/* Exact match with entry [1]: result is the argument times table entry [0]. */
(*r) = (*a) * ((__constant double *) __dcospi_la_CoutTab)[0];
125+
126+
nRet = 1;
127+
}
128+
else
129+
{
130+
131+
/* Exponent field is 0x7FF but the pattern differs from entry [1]: result is *a + *a. */
(*r) = ((*a) + (*a));
132+
}
133+
}
134+
return nRet;
135+
}
136+
137+
#pragma float_control(pop)
138+
/*
 * Scalar double-precision entry point; the cospi.cl hunk of this commit
 * calls it from __builtin_spirv_OpenCL_cospi_f64.
 *
 * Steps, in the order written below:
 *   1. dAbsX = a with the sign bit cleared.
 *   2. If !(dAbsX <= 2^43) - which also holds when a is NaN, because the
 *      comparison is then false - pre-reduce dAbsX by adding and
 *      subtracting a large shifter, and set vm when every exponent bit of
 *      dAbsX is set.
 *   3. Add 0.5, add/subtract _dRShifter (1.5 * 2^52) to obtain dN, take
 *      the lowest bit of the shifted value as a sign flag, and form the
 *      reduced argument dR.
 *   4. Evaluate C0*dR + dR^3 * (C1 + C2*dR^2 + ... + C8*dR^14) with fma.
 *   5. If vm is non-zero, let __internal_dcospi_la_cout overwrite the
 *      result.
 *
 * a       - input value.
 * returns - the value computed above.
 *
 * NOTE(review): presumably this is cos(pi * a) evaluated as a sine-type
 * polynomial of the reduced argument, with the add/subtract steps acting
 * as round-to-nearest-integer - confirm against the OpenCL cospi
 * requirements.
 *
 * Locals that are declared but never read or written: dSign,
 * dSignReduced, dInvPI, dPI, dHalfPI, dPiToRad, and in the inner block
 * dShifterMod, dDirect, dInverse.  The inner block's dX and dN shadow the
 * outer variables of the same names.
 */
double __ocl_svml_cospi (double a)
139+
{
140+
141+
double va1;
142+
double vr1;
143+
unsigned int vm;
144+
145+
double r;
146+
147+
va1 = a;;
148+
149+
{
150+
151+
double dX;
152+
double dAbsX;
153+
double dExp;
154+
double dRangeMask;
155+
unsigned long lRangeMask;
156+
double dReductionRangeMask;
157+
unsigned long lReductionRangeMask;
158+
unsigned int mReductionRangeMask;
159+
double dSign;
160+
double dSignReduced;
161+
double dSignRes;
162+
double dN;
163+
double dY;
164+
double dR;
165+
double dRp2;
166+
double dPoly;
167+
168+
double dAbsMask;
169+
double dRangeVal;
170+
double dReductionRangeVal;
171+
double dRShifter;
172+
double dInvPI;
173+
double dPI;
174+
double dHalfPI;
175+
double dOneHalf;
176+
double dC0;
177+
double dPiToRad;
178+
179+
double dC1;
180+
double dC2;
181+
double dC3;
182+
double dC4;
183+
double dC5;
184+
double dC6;
185+
double dC7;
186+
double dC8;
187+
188+
dX = va1;
189+
190+
/* vm stays 0 unless the large-argument branch below sets it. */
vm = 0;
191+
192+
/* Clear the sign bit by AND-ing the bit pattern with _dAbsMask. */
dAbsMask = as_double (__internal_dcospi_la_data._dAbsMask);
193+
dAbsX = as_double ((as_ulong (dX) & as_ulong (dAbsMask)));
194+
195+
dR = dAbsX;
196+
197+
/* Mask is all ones when dAbsX is not <= _dReductionRangeVal (2^43); a NaN input also lands here. */
dReductionRangeVal = as_double (__internal_dcospi_la_data._dReductionRangeVal);
198+
dReductionRangeMask = as_double ((unsigned long) (((!(dAbsX <= dReductionRangeVal)) ? 0xffffffffffffffff : 0x0)));
199+
lReductionRangeMask = as_ulong (dReductionRangeMask);
200+
mReductionRangeMask = 0;
201+
/* Narrowing to unsigned int keeps the low 32 bits; the mask is all ones or zero, so the != 0 test is unaffected. */
mReductionRangeMask = lReductionRangeMask;
202+
if ((mReductionRangeMask) != 0)
203+
{
204+
205+
/* Keep only the exponent bits of dAbsX and compare, as doubles, with _dRangeVal (all exponent bits set). */
dRangeVal = as_double (__internal_dcospi_la_data._dRangeVal);
206+
dExp = as_double ((as_ulong (dRangeVal) & as_ulong (dAbsX)));
207+
dRangeMask = as_double ((unsigned long) ((dExp == dRangeVal) ? 0xffffffffffffffff : 0x0));
208+
lRangeMask = as_ulong (dRangeMask);
209+
vm = 0;
210+
/* vm becomes non-zero when the comparison above held; it triggers the fix-up call after the polynomial. */
vm = lRangeMask;
211+
212+
{
213+
214+
double dX;
215+
double dShifterMod;
216+
double dShifterThreshold;
217+
double dShifterMask;
218+
double dShifterPos;
219+
double dShifter;
220+
double dDirect;
221+
double dInverse;
222+
double dShiftedN;
223+
double dN;
224+
double dZero;
225+
dX = dAbsX;
226+
/* 0x43A0000000000000 is 2^59. */
dShifterThreshold = as_double (0x43A0000000000000uL);
227+
dShifterMask = as_double ((unsigned long) ((dX < dShifterThreshold) ? 0xffffffffffffffff : 0x0));
228+
229+
/* 0x43B8000000000000 is 1.5 * 2^60. */
dShifterPos = as_double (0x43B8000000000000uL);
230+
dZero = as_double (0x0000000000000000uL);
231+
/* Bitwise select: dShifterPos when dAbsX < 2^59, otherwise zero. */
dShifter = as_double ((((~as_ulong (dShifterMask)) & as_ulong (dZero)) | (as_ulong (dShifterMask) & as_ulong (dShifterPos))));
232+
233+
/* Add then subtract the shifter; with a zero shifter dN equals dAbsX and dR becomes 0. */
dShiftedN = (dShifter + dAbsX);
234+
dN = (dShiftedN - dShifter);
235+
dR = (dAbsX - dN);
236+
237+
}
238+
239+
/* Bitwise select between dAbsX and dR; inside this branch the mask is all ones, so dR is kept. */
dR = as_double ((((~as_ulong (dReductionRangeMask)) & as_ulong (dAbsX)) | (as_ulong (dReductionRangeMask) & as_ulong (dR))));
240+
}
241+
242+
dOneHalf = as_double (__internal_dcospi_la_data._dOneHalf);
243+
dX = (dR + dOneHalf);
244+
245+
/* Add and subtract _dRShifter (1.5 * 2^52) to obtain dN from dX. */
dRShifter = as_double (__internal_dcospi_la_data._dRShifter);
246+
dY = (dX + dRShifter);
247+
dN = (dY - dRShifter);
248+
249+
/* Move bit 0 of dY's bit pattern into bit 63 (the sign position). */
dSignRes = as_double (((unsigned long) as_ulong (dY) << (63)));
250+
251+
dN = (dN - dOneHalf);
252+
dR = (dR - dN);
253+
254+
/* Flip the sign of dR when the bit captured in dSignRes is set. */
dR = as_double ((as_ulong (dR) ^ as_ulong (dSignRes)));
255+
dRp2 = (dR * dR);
256+
257+
/* Horner evaluation in dRp2, from _dC8 down to _dC1, one fma per coefficient. */
dC8 = as_double (__internal_dcospi_la_data._dC8);
258+
dC7 = as_double (__internal_dcospi_la_data._dC7);
259+
dC6 = as_double (__internal_dcospi_la_data._dC6);
260+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dC8, dRp2, dC7);
261+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC6);
262+
dC5 = as_double (__internal_dcospi_la_data._dC5);
263+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC5);
264+
dC4 = as_double (__internal_dcospi_la_data._dC4);
265+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC4);
266+
dC3 = as_double (__internal_dcospi_la_data._dC3);
267+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC3);
268+
269+
dC2 = as_double (__internal_dcospi_la_data._dC2);
270+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC2);
271+
dC1 = as_double (__internal_dcospi_la_data._dC1);
272+
dPoly = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dRp2, dC1);
273+
/* From here on dC0 holds the linear term _dC0 * dR, not the raw coefficient. */
dC0 = as_double (__internal_dcospi_la_data._dC0);
274+
dC0 = (dC0 * dR);
275+
dPoly = (dPoly * dRp2);
276+
277+
/* Result: (dPoly * dRp2) * dR + _dC0 * dR. */
vr1 = __builtin_spirv_OpenCL_fma_f64_f64_f64 (dPoly, dR, dC0);
278+
}
279+
280+
/* Fix-up path: pass the original argument and the current result through one-element arrays. */
if ((vm) != 0)
281+
{
282+
double _vapi_arg1[1];
283+
double _vapi_res1[1];
284+
((double *) _vapi_arg1)[0] = va1;
285+
((double *) _vapi_res1)[0] = vr1;
286+
/* Return value is ignored; only the (possibly rewritten) result slot is used. */
__internal_dcospi_la_cout (_vapi_arg1, _vapi_res1);
287+
vr1 = ((double *) _vapi_res1)[0];
288+
};
289+
r = vr1;;
290+
291+
return r;
292+
293+
}

IGC/BiFModule/Implementation/Math/cospi.cl

+2-4
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3131
#include "../ExternalLibraries/libclc/trig.cl"
3232

3333
#if defined(cl_khr_fp64)
34-
35-
#include "../ExternalLibraries/libclc/doubles.cl"
36-
34+
#include "../IMF/FP64/cospi_d_la.cl"
3735
#endif // defined(cl_khr_fp64)
3836

3937
INLINE float __builtin_spirv_OpenCL_cospi_f32( float x )
@@ -56,7 +54,7 @@ GENERATE_VECTOR_FUNCTIONS_1ARG_LOOP( __builtin_spirv_OpenCL_cospi, float, float,
5654

5755
INLINE double __builtin_spirv_OpenCL_cospi_f64( double x )
5856
{
59-
return libclc_cospi_f64(x);
57+
return __ocl_svml_cospi(x);
6058
}
6159

6260
GENERATE_VECTOR_FUNCTIONS_1ARG_LOOP( __builtin_spirv_OpenCL_cospi, double, double, f64 )

0 commit comments

Comments
 (0)