Skip to content

Commit 4268296

Browse files
committed
Added AVX512 Support
Bench: 3291373
1 parent 56cb128 commit 4268296

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

src/makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ endif
1818

1919
SSSE = -mssse3 -msse3 -msse2 -msse -mpopcnt
2020
AVX2 = -mavx2 -mavx -mfma -msse4.1 $(SSSE)
21+
AVX512 = -mavx512f -mavx512bw -mavx512dq $(AVX2)
2122
PEXT = -mbmi2 -DUSE_PEXT
2223

2324
all:
@@ -44,4 +45,10 @@ avx2:
4445
avx2-pext:
4546
$(CC) $(RFLAGS) $(DEFS) $(SRC) $(LIBS) $(AVX2) $(PEXT) -o $(EXE)-$(VERSION)-x64-avx2-pext$(EXT)
4647

47-
release: ssse pext avx2 avx2-pext
48+
avx512:
49+
$(CC) $(RFLAGS) $(DEFS) $(SRC) $(LIBS) $(AVX512) -o $(EXE)-$(VERSION)-x64-avx512$(EXT)
50+
51+
avx512-pext:
52+
$(CC) $(RFLAGS) $(DEFS) $(SRC) $(LIBS) $(AVX512) $(PEXT) -o $(EXE)-$(VERSION)-x64-avx512-pext$(EXT)
53+
54+
release: ssse pext avx2 avx2-pext avx512 avx512-pext

src/nn.c

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,35 @@ inline void RefreshAccumulator(Accumulator accumulator, Board* board, const int
6464
}
6565
}
6666

67-
#if defined(__AVX2__)
67+
#if defined(__AVX512F__)
68+
const size_t WIDTH = sizeof(__m512i) / sizeof(int16_t);
69+
const size_t CHUNKS = N_HIDDEN / WIDTH;
70+
71+
int OutputLayer(Accumulator stm, Accumulator xstm) {
72+
int result = OUTPUT_BIAS * QUANTIZATION_PRECISION_IN;
73+
74+
const __m512i zero = _mm512_setzero_si512();
75+
__m512i s0 = _mm512_setzero_si512();
76+
__m512i s1 = _mm512_setzero_si512();
77+
78+
for (size_t j = 0; j < CHUNKS; j++) {
79+
const __m512i ac0 = _mm512_max_epi16(*(__m512i*)&stm[j * WIDTH], zero);
80+
const __m512i ac1 = _mm512_max_epi16(*(__m512i*)&xstm[j * WIDTH], zero);
81+
82+
s0 = _mm512_add_epi32(s0, _mm512_madd_epi16(ac0, *(__m512i*)&HIDDEN_WEIGHTS[j * WIDTH]));
83+
s1 = _mm512_add_epi32(s1, _mm512_madd_epi16(ac1, *(__m512i*)&HIDDEN_WEIGHTS[j * WIDTH + N_HIDDEN]));
84+
}
85+
86+
const __m512i r16 = _mm512_add_epi32(s0, s1);
87+
const __m256i r8 = _mm256_add_epi32(_mm512_castsi512_si256(r16), _mm512_extracti32x8_epi32(r16, 1));
88+
const __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(r8), _mm256_extractf128_si256(r8, 1));
89+
const __m128i r2 = _mm_add_epi32(r4, _mm_srli_si128(r4, 8));
90+
const __m128i r1 = _mm_add_epi32(r2, _mm_srli_si128(r2, 4));
91+
92+
result += _mm_cvtsi128_si32(r1);
93+
return result / QUANTIZATION_PRECISION_IN / QUANTIZATION_PRECISION_OUT;
94+
}
95+
#elif defined(__AVX2__)
6896
const size_t WIDTH = sizeof(__m256i) / sizeof(int16_t);
6997
const size_t CHUNKS = N_HIDDEN / WIDTH;
7098

0 commit comments

Comments
 (0)