src/layer/riscv/bias_riscv_zfh.cpp (2 changes: 1 addition & 1 deletion)
@@ -25,7 +25,7 @@ int Bias_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c
     for (int q = 0; q < channels; q++)
     {
         __fp16* ptr = bottom_top_blob.channel(q);
-        __fp16 bias = bias_data[q];
+        __fp16 bias = (__fp16)bias_data[q];

 #if __riscv_zvfh
         int n = size;
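
Note: every change in this PR is the same one-line fix. A scalar operand handed to an f16 vector intrinsic (or an __fp16 initializer) gains an explicit (__fp16) cast, because literals like 0.f and float locals have type float, while these intrinsics expect an __fp16/_Float16 scalar, and some toolchains reject the implicit conversion. A minimal sketch of the pattern, assuming a compiler with RVV intrinsics and the Zvfh extension (the helper name and compile flag below are illustrative, not from this PR):

// Minimal sketch, e.g. clang with -march=rv64gcv_zvfh (illustrative only).
#include <riscv_vector.h>

static inline vfloat16m1_t splat_zero_f16m1(size_t vl)
{
    // Without the cast, the scalar 0.f is a float, which strict toolchains
    // reject for an intrinsic whose scalar operand is _Float16:
    //     return __riscv_vfmv_v_f_f16m1(0.f, vl);
    // The explicit cast makes the scalar match the f16 element type:
    return __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
}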
src/layer/riscv/celu_riscv_zfh.cpp (8 changes: 4 additions & 4 deletions)
@@ -32,12 +32,12 @@ int CELU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c
            size_t vl = __riscv_vsetvl_e16m8(n);

            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
-           vbool2_t _mask = __riscv_vmfgt_vf_f16m8_b2(_p, 0.f, vl);
+           vbool2_t _mask = __riscv_vmfgt_vf_f16m8_b2(_p, (__fp16)0.f, vl);

-           vfloat16m8_t _q = __riscv_vfdiv_vf_f16m8(_p, alpha, vl);
+           vfloat16m8_t _q = __riscv_vfdiv_vf_f16m8(_p, (__fp16)alpha, vl);
            _q = exp_ps(_q, vl);
-           _q = __riscv_vfsub_vf_f16m8(_q, 1.f, vl);
-           _q = __riscv_vfmul_vf_f16m8(_q, alpha, vl);
+           _q = __riscv_vfsub_vf_f16m8(_q, (__fp16)1.f, vl);
+           _q = __riscv_vfmul_vf_f16m8(_q, (__fp16)alpha, vl);

            vfloat16m8_t _res = __riscv_vmerge_vvm_f16m8(_q, _p, _mask, vl);
            __riscv_vse16_v_f16m8(ptr, _res, vl);
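
For reference, the masked merge above is the standard vector CELU: _res keeps x where x > 0 and alpha * (exp(x / alpha) - 1) elsewhere. The change affects only the types of the scalar operands, not the math.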
src/layer/riscv/convolution1d_riscv_zfh.cpp (14 changes: 7 additions & 7 deletions)
@@ -143,7 +143,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
                {
                    float val = (float)*slptr++;
                    vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl);
-                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl);
+                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w0, vl);

                    kptr += packn;
                }
@@ -186,7 +186,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co
                {
                    float val = (float)sptr[0];
                    vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl);
-                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl);
+                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w, vl);

                    sptr += dilation_w;
                    kptr += packn;
@@ -353,7 +353,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c

            for (int j = 0; j < outw; j++)
            {
-               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

                if (bias_term)
                {
@@ -400,7 +400,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c

            for (int j = 0; j < outw; j++)
            {
-               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

                if (bias_term)
                {
@@ -443,14 +443,14 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c

            for (int j = 0; j < outw; j++)
            {
-               __fp16 sum = 0.f;
+               __fp16 sum = (__fp16)0.f;

                if (bias_term)
                {
                    sum = ((const __fp16*)bias_data_fp16)[p];
                }

-               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+               vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

                const __fp16* kptr = weight_data_fp16.channel(p);

@@ -471,7 +471,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c

                sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl));

-               sum = activation_ss(sum, activation_type, activation_params);
+               sum = (__fp16)activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }
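
The surrounding reduction idiom is untouched: __riscv_vfmv_s_f_f16m1 seeds element 0 of a vector with the running scalar sum, __riscv_vfredusum_vs_f16m1_f16m1 folds the vector accumulator into it, and __riscv_vfmv_f_s_f16m1_f16 reads the scalar back out. The new cast only narrows the activation_ss result, which appears to be computed at float precision, back to __fp16 explicitly.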
src/layer/riscv/convolution_3x3_pack1ton_fp16s.h (4 changes: 2 additions & 2 deletions)
@@ -18,7 +18,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob,
    {
        Mat out0 = top_blob.channel(p);

-       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl);
+       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);
@@ -296,7 +296,7 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob,
    {
        Mat out0 = top_blob.channel(p);

-       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl);
+       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);
src/layer/riscv/convolution_7x7_pack1ton_fp16s.h (2 changes: 1 addition & 1 deletion)
@@ -22,7 +22,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob,
    {
        Mat out0 = top_blob.channel(p);

-       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl);
+       vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
src/layer/riscv/convolution_pack1ton_fp16s.h (4 changes: 2 additions & 2 deletions)
@@ -65,7 +65,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob
                {
                    float val = (float)sptr[space_ofs[k]];
                    vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl);
-                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl);
+                   _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w, vl);

                    kptr += packn;
                }
@@ -126,7 +126,7 @@ static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo
    {
        for (int j = 0; j < outw; j++)
        {
-           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias_data_ptr)
            {
src/layer/riscv/convolution_packn_fp16s.h (4 changes: 2 additions & 2 deletions)
@@ -71,7 +71,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c
                    vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl);
                    // _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl);

-                   vfloat32m2_t _qwq = __riscv_vfwmul_vf_f32m2(_w0, val, vl);
+                   vfloat32m2_t _qwq = __riscv_vfwmul_vf_f32m2(_w0, (__fp16)val, vl);
                    _sum = __riscv_vfadd_vv_f32m2(_sum, _qwq, vl);

                    kptr += packn;
@@ -134,7 +134,7 @@ static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob,
    {
        for (int j = 0; j < outw; j++)
        {
-           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias_data_ptr)
            {
src/layer/riscv/convolution_packnto1_fp16s.h (8 changes: 4 additions & 4 deletions)
@@ -85,7 +85,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob
            sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl));
 #endif

-           sum = activation_ss(sum, activation_type, activation_params);
+           sum = (__fp16)activation_ss(sum, activation_type, activation_params);

            outptr[j] = (__fp16)sum;
        }
@@ -140,14 +140,14 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo
    {
        for (int j = 0; j < outw; j++)
        {
-           __fp16 sum = 0.f;
+           __fp16 sum = (__fp16)0.f;

            if (bias_data_ptr)
            {
                sum = bias_data_ptr[p];
            }

-           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            const __fp16* kptr = weight_data_fp16.channel(p);

@@ -169,7 +169,7 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo

            sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl));

-           sum = activation_ss(sum, activation_type, activation_params);
+           sum = (__fp16)activation_ss(sum, activation_type, activation_params);

            outptr[j] = sum;
        }
src/layer/riscv/convolution_sgemm_fp16s.h (8 changes: 4 additions & 4 deletions)
@@ -109,7 +109,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con
        __fp16* outptr6 = top_blob.channel(p + 6);
        __fp16* outptr7 = top_blob.channel(p + 7);

-       const __fp16 zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+       const __fp16 zeros[8] = {(__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f};
        const __fp16* biasptr = bias ? bias + p : zeros;

        int i = 0;
@@ -224,7 +224,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con
        __fp16* outptr2 = top_blob.channel(p + 2);
        __fp16* outptr3 = top_blob.channel(p + 3);

-       const __fp16 zeros[4] = {0.f, 0.f, 0.f, 0.f};
+       const __fp16 zeros[4] = {(__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f};
        const __fp16* biasptr = bias ? bias + p : zeros;

        int i = 0;
@@ -302,7 +302,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con
    {
        __fp16* outptr0 = top_blob.channel(p);

-       const __fp16 bias0 = bias ? bias[p] : 0.f;
+       const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f;

        int i = 0;
        for (; i + (packn - 1) < size; i += packn)
@@ -352,7 +352,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con
    {
        __fp16* outptr0 = top_blob.channel(p);

-       const __fp16 bias0 = bias ? bias[p] : 0.f;
+       const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f;

        for (int i = 0; i < size; i++)
        {
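
The same typing rule carries over to the aggregate initializers in this file: each element of the stack zeros arrays (the fallback when there is no bias) is now spelled (__fp16)0.f, leaving no implicit float-to-__fp16 narrowing in the initializer lists.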
src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

            int nn = inch * maxk; // inch always > 0

-           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias)
            {
src/layer/riscv/convolution_sgemm_packn_fp16s.h (30 changes: 15 additions & 15 deletions)
@@ -262,14 +262,14 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

            int nn = inch * maxk * packn; // inch always > 0

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias)
            {
@@ -324,10 +324,10 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

            int nn = inch * maxk * packn; // inch always > 0

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias)
            {
@@ -366,8 +366,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

            int nn = inch * maxk * packn; // inch always > 0

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias)
            {
@@ -398,7 +398,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

            int nn = inch * maxk * packn; // inch always > 0

-           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            if (bias)
            {
src/layer/riscv/convolution_sgemm_packnto1_fp16s.h (34 changes: 17 additions & 17 deletions)
@@ -197,7 +197,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
 #ifdef __clang__
        const __fp16* zeros = _zero_tmp;
 #else
-       const __fp16 zeros[packn] = {0.f};
+       const __fp16 zeros[packn] = {(__fp16)0.f};
 #endif // __clang__
        const __fp16* biasptr = bias ? bias + p : zeros;

@@ -353,7 +353,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
    {
        __fp16* outptr0 = top_blob.channel(p);

-       const __fp16 bias0 = bias ? bias[p] : 0.f;
+       const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f;

        int i = 0;
        for (; i + 7 < size; i += 8)
@@ -372,14 +372,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
            __fp16 sum6 = bias0;
            __fp16 sum7 = bias0;

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            for (int j = 0; j < nn; j++)
            {
@@ -460,10 +460,10 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
            __fp16 sum2 = bias0;
            __fp16 sum3 = bias0;

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            for (int j = 0; j < nn; j++)
            {
@@ -518,8 +518,8 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_
            __fp16 sum0 = bias0;
            __fp16 sum1 = bias0;

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
-           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
+           vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            for (int j = 0; j < nn; j++)
            {
@@ -561,7 +561,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_

            __fp16 sum0 = bias0;

-           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl);
+           vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);

            for (int j = 0; j < nn; j++)
            {
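
One detail worth noting in this file: packn is not a compile-time constant, so const __fp16 zeros[packn] = {(__fp16)0.f}; is a variable-length array with an initializer, a GCC extension that clang rejects. That is presumably why the #ifdef __clang__ branch above points zeros at the separately prepared _zero_tmp buffer instead.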