Skip to content

Commit c8f029a

Browse files
authored
Merge pull request #82 from xianyi/develop
rebase
2 parents 0d1f30a + e72430f commit c8f029a

File tree

13 files changed

+555
-15
lines changed

13 files changed

+555
-15
lines changed

Makefile.arm64

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
6666
endif
6767
endif
6868

69+
ifeq ($(CORE), VORTEX)
70+
CCOMMON_OPT += -march=armv8.3-a
71+
FCOMMON_OPT += -march=armv8.3-a
72+
endif
73+
6974
ifeq ($(GCCVERSIONGTEQ9), 1)
7075
ifeq ($(CORE), TSV110)
7176
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110

TargetList.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ THUNDERX
9898
THUNDERX2T99
9999
TSV110
100100
THUNDERX3T110
101+
VORTEX
101102

102103
9.System Z:
103104
ZARCH_GENERIC

benchmark/asum.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,13 @@ int main(int argc, char *argv[]){
128128
int to = 200;
129129
int step = 1;
130130

131+
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
131132
struct timeval start, stop;
132133
double time1,timeg;
134+
#else
135+
struct timespec start = { 0, 0 }, stop = { 0, 0 };
136+
double time1, timeg;
137+
#endif
133138

134139
argc--;argv++;
135140

@@ -160,26 +165,30 @@ int main(int argc, char *argv[]){
160165

161166
fprintf(stderr, " %6d : ", (int)m);
162167

163-
164168
for (l=0; l<loops; l++)
165169
{
166170

167171
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
168172
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
169173
}
170-
174+
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
171175
gettimeofday( &start, (struct timezone *)0);
172-
176+
#else
177+
clock_gettime(CLOCK_REALTIME, &start);
178+
#endif
173179
result = ASUM (&m, x, &inc_x);
174-
175-
gettimeofday( &stop, (struct timezone *)0);
176-
177-
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
180+
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
181+
clock_gettime(CLOCK_REALTIME, &stop);
182+
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
183+
#else
184+
gettimeofday( &stop, (struct timezone *)0);
185+
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
186+
#endif
178187

179188
timeg += time1;
180189

181190
}
182-
191+
if (loops >1)
183192
timeg /= loops;
184193

185194
#ifdef COMPLEX

cpuid_arm64.c

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
*****************************************************************************/
2727

2828
#include <string.h>
29+
#ifdef OS_DARWIN
30+
#include <sys/sysctl.h>
31+
int32_t value;
32+
size_t length=sizeof(value);
33+
#endif
2934

3035
#define CPU_UNKNOWN 0
3136
#define CPU_ARMV8 1
@@ -45,6 +50,8 @@
4550
#define CPU_TSV110 9
4651
// Ampere
4752
#define CPU_EMAG8180 10
53+
// Apple
54+
#define CPU_VORTEX 13
4855

4956
static char *cpuname[] = {
5057
"UNKNOWN",
@@ -59,7 +66,8 @@ static char *cpuname[] = {
5966
"TSV110",
6067
"EMAG8180",
6168
"NEOVERSEN1",
62-
"THUNDERX3T110"
69+
"THUNDERX3T110",
70+
"VORTEX"
6371
};
6472

6573
static char *cpuname_lower[] = {
@@ -75,7 +83,8 @@ static char *cpuname_lower[] = {
7583
"tsv110",
7684
"emag8180",
7785
"neoversen1",
78-
"thunderx3t110"
86+
"thunderx3t110",
87+
"vortex"
7988
};
8089

8190
int get_feature(char *search)
@@ -198,6 +207,10 @@ int detect(void)
198207

199208
}
200209
#else
210+
#ifdef DARWIN
211+
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
212+
if (value ==131287967) return CPU_VORTEX;
213+
#endif
201214
return CPU_ARMV8;
202215
#endif
203216

@@ -247,7 +260,10 @@ int n=0;
247260

248261
printf("#define NUM_CORES %d\n",n);
249262
#endif
250-
263+
#ifdef DARWIN
264+
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
265+
printf("#define NUM_CORES %d\n",value);
266+
#endif
251267
}
252268

253269

@@ -398,6 +414,19 @@ void get_cpuconfig(void)
398414
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
399415
printf("#define DTB_SIZE 4096 \n");
400416
break;
417+
#ifdef DARWIN
418+
case CPU_VORTEX:
419+
printf("#define VORTEX \n");
420+
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
421+
printf("#define L1_CODE_SIZE %d \n",value);
422+
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
423+
printf("#define L1_CODE_LINESIZE %d \n",value);
424+
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
425+
printf("#define L1_DATA_SIZE %d \n",value);
426+
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
427+
printf("#define L2_DATA_SIZE %d \n",value);
428+
break;
429+
#endif
401430
}
402431
get_cpucount();
403432
}

kernel/arm64/KERNEL.VORTEX

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include $(KERNELDIR)/KERNEL.ARMV8

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
100100
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
101101
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
102102

103+
SASUMKERNEL = sasum.c
104+
DASUMKERNEL = dasum.c

kernel/x86_64/dasum.c

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#include "common.h"
2+
3+
#ifndef ABS_K
4+
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
5+
#endif
6+
7+
#if defined(SKYLAKEX)
8+
#include "dasum_microk_skylakex-2.c"
9+
#elif defined(HASWELL)
10+
#include "dasum_microk_haswell-2.c"
11+
#endif
12+
13+
#ifndef HAVE_DASUM_KERNEL
14+
static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
15+
{
16+
17+
BLASLONG i=0;
18+
BLASLONG n_8 = n & -8;
19+
FLOAT *x = x1;
20+
FLOAT temp0, temp1, temp2, temp3;
21+
FLOAT temp4, temp5, temp6, temp7;
22+
FLOAT sum0 = 0.0;
23+
FLOAT sum1 = 0.0;
24+
FLOAT sum2 = 0.0;
25+
FLOAT sum3 = 0.0;
26+
FLOAT sum4 = 0.0;
27+
28+
while (i < n_8) {
29+
temp0 = ABS_K(x[0]);
30+
temp1 = ABS_K(x[1]);
31+
temp2 = ABS_K(x[2]);
32+
temp3 = ABS_K(x[3]);
33+
temp4 = ABS_K(x[4]);
34+
temp5 = ABS_K(x[5]);
35+
temp6 = ABS_K(x[6]);
36+
temp7 = ABS_K(x[7]);
37+
38+
sum0 += temp0;
39+
sum1 += temp1;
40+
sum2 += temp2;
41+
sum3 += temp3;
42+
43+
sum0 += temp4;
44+
sum1 += temp5;
45+
sum2 += temp6;
46+
sum3 += temp7;
47+
48+
x+=8;
49+
i+=8;
50+
}
51+
52+
while (i < n) {
53+
sum4 += ABS_K(x1[i]);
54+
i++;
55+
}
56+
57+
return sum0+sum1+sum2+sum3+sum4;
58+
}
59+
60+
#endif
61+
62+
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
63+
{
64+
BLASLONG i=0;
65+
FLOAT sumf = 0.0;
66+
67+
if (n <= 0 || inc_x <= 0) return(sumf);
68+
69+
if ( inc_x == 1 ) {
70+
sumf = dasum_kernel(n, x);
71+
}
72+
else {
73+
n *= inc_x;
74+
75+
while(i < n) {
76+
sumf += ABS_K(x[i]);
77+
i += inc_x;
78+
}
79+
}
80+
return(sumf);
81+
}
82+
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
2+
3+
#define HAVE_DASUM_KERNEL
4+
5+
#include <immintrin.h>
6+
#include <stdint.h>
7+
8+
#ifndef ABS_K
9+
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
10+
#endif
11+
12+
static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
13+
{
14+
BLASLONG i = 0;
15+
FLOAT sumf = 0.0;
16+
17+
if (n >= 256) {
18+
BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3;
19+
20+
for (i = 0; i < align_256; i++) {
21+
sumf += ABS_K(x1[i]);
22+
}
23+
24+
n -= align_256;
25+
x1 += align_256;
26+
}
27+
28+
BLASLONG tail_index_SSE = n&(~7);
29+
BLASLONG tail_index_AVX2 = n&(~255);
30+
31+
if (n >= 256) {
32+
__m256d accum_0, accum_1, accum_2, accum_3;
33+
34+
accum_0 = _mm256_setzero_pd();
35+
accum_1 = _mm256_setzero_pd();
36+
accum_2 = _mm256_setzero_pd();
37+
accum_3 = _mm256_setzero_pd();
38+
39+
__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
40+
for (i = 0; i < tail_index_AVX2; i += 16) {
41+
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
42+
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask);
43+
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
44+
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask);
45+
}
46+
47+
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
48+
49+
__m128d half_accum0;
50+
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
51+
52+
half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
53+
54+
sumf += half_accum0[0];
55+
}
56+
57+
if (n >= 8) {
58+
__m128d accum_20, accum_21, accum_22, accum_23;
59+
accum_20 = _mm_setzero_pd();
60+
accum_21 = _mm_setzero_pd();
61+
accum_22 = _mm_setzero_pd();
62+
accum_23 = _mm_setzero_pd();
63+
64+
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
65+
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
66+
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
67+
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
68+
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
69+
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
70+
}
71+
72+
accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
73+
__m128d half_accum20;
74+
half_accum20 = _mm_hadd_pd(accum_20, accum_20);
75+
76+
sumf += half_accum20[0];
77+
}
78+
79+
for (i = tail_index_SSE; i < n; ++i) {
80+
sumf += ABS_K(x1[i]);
81+
}
82+
83+
return sumf;
84+
85+
}
86+
#endif

0 commit comments

Comments
 (0)