Skip to content

Commit ddec244

Browse files
authored
Merge pull request #2838 from austinpagan/gordon_trmm
Adding performance patch for trmm, just like trsm (#2836)
2 parents f8950f4 + dfeca46 commit ddec244

File tree

2 files changed

+10
-10
lines changed

2 files changed

+10
-10
lines changed

driver/level3/trmm_L.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
139139
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
140140
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
141141
#else
142-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
142+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
143143
else
144144
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
145145
#endif
@@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
209209
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
210210
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
211211
#else
212-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
212+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
213213
else
214214
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
215215
#endif
@@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
304304
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
305305
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
306306
#else
307-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
307+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
308308
else
309309
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
310310
#endif
@@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
374374
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
375375
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
376376
#else
377-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
377+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
378378
else
379379
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
380380
#endif

driver/level3/trmm_R.c

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
126126
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
127127
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
128128
#else
129-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
129+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
130130
else
131131
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
132132
#endif
@@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
150150
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
151151
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
152152
#else
153-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
153+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
154154
else
155155
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
156156
#endif
@@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
207207
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
208208
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
209209
#else
210-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
210+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
211211
else
212212
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
213213
#endif
@@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
262262
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
263263
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
264264
#else
265-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
265+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
266266
else
267267
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
268268
#endif
@@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
287287
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
288288
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
289289
#else
290-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
290+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
291291
else
292292
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
293293
#endif
@@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
348348
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
349349
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
350350
#else
351-
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
351+
if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
352352
else
353353
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
354354
#endif

0 commit comments

Comments (0)