From cc37862bf6345957a227d56679b8f1f9925c30a1 Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Fri, 23 Jan 2026 09:28:19 +0800 Subject: [PATCH 1/7] fix:small part of convolution_winograd_transform_packn_fp16s.h --- ...nvolution_winograd_transform_packn_fp16s.h | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index c450a60063ba..1d779a2aa7fc 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ -70,29 +70,29 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot vfloat16m1_t _r06 = __riscv_vle16_v_f16m1(r0 + packn * 6, vl); vfloat16m1_t _r07 = __riscv_vle16_v_f16m1(r0 + packn * 7, vl); - vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, __riscv_vfsub_vv_f16m1(_r04, _r02, vl), vl); - vfloat16m1_t _tmp7m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, __riscv_vfsub_vv_f16m1(_r03, _r05, vl), vl); + vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r00, _r06, vl), (__fp16)5.25f, __riscv_vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r07, _r01, vl), (__fp16)5.25f, __riscv_vfsub_vv_f16m1(_r03, _r05, vl), vl); __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); __riscv_vse16_v_f16m1(tmp[7][m], _tmp7m, vl); - vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r02, _r06, vl), (__fp16)-4.25f, _r04, vl); + vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), (__fp16)-4.25f, _r03, vl); 
vfloat16m1_t _tmp1m = __riscv_vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); vfloat16m1_t _tmp2m = __riscv_vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, vl); + vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r06, (__fp16)0.25f, _r02, vl), (__fp16)-1.25f, _r04, vl); + vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)0.5f, vl), (__fp16)-2.5f, _r03, vl), (__fp16)2.f, _r05, vl); vfloat16m1_t _tmp3m = __riscv_vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); vfloat16m1_t _tmp4m = __riscv_vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_r06, 4.f, __riscv_vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); - vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, vl); + vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_r06, 4.f, __riscv_vfmacc_vf_f16m1(_r02, (__fp16)-1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)2.f, vl), (__fp16)-2.5f, _r03, vl), (__fp16)0.5f, _r05, vl); vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); vfloat16m1_t _tmp6m = __riscv_vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); @@ -122,23 +122,23 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot vfloat16m1_t _tmp06 = __riscv_vle16_v_f16m1(tmp[m][6], vl); vfloat16m1_t _tmp07 = __riscv_vle16_v_f16m1(tmp[m][7], vl); - vfloat16m1_t _r0tm0 = 
__riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, __riscv_vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); - vfloat16m1_t _r0tm7 = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, __riscv_vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + vfloat16m1_t _r0tm0 = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp00, _tmp06, vl), (__fp16)5.25f, __riscv_vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp07, _tmp01, vl), (__fp16)5.25f, __riscv_vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); - vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp02, _tmp06, vl), (__fp16)-4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp01, _tmp05, vl), (__fp16)-4.25f, _tmp03, vl); vfloat16m1_t _r0tm1 = __riscv_vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); vfloat16m1_t _r0tm2 = __riscv_vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); + vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), (__fp16)-1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, (__fp16)0.5f, vl), (__fp16)-2.5f, _tmp03, vl), (__fp16)2.f, _tmp05, vl); vfloat16m1_t _r0tm3 = __riscv_vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); vfloat16m1_t _r0tm4 = __riscv_vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_tmp06, 4.f, 
__riscv_vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, _tmp05, vl); + vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_tmp06, (__fp16)4.f, __riscv_vfmacc_vf_f16m1(_tmp02, (__fp16)-1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, (__fp16)2.f, vl), (__fp16)-2.5f, _tmp03, vl), (__fp16)0.5f, _tmp05, vl); vfloat16m1_t _r0tm5 = __riscv_vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); vfloat16m1_t _r0tm6 = __riscv_vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); @@ -203,7 +203,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); // NOTE c99 variable length array __fp16 tmp[6][8][packn]; @@ -244,15 +244,15 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to vfloat16m1_t _tmp024c = __riscv_vfadd_vv_f16m1(_out0tm5, _out0tm6, vl); vfloat16m1_t _tmp135c = __riscv_vfsub_vv_f16m1(_out0tm5, _out0tm6, vl); - vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm0, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat16m1_t _tmp2m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat16m1_t _tmp4m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm0, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, (__fp16)32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = 
__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, (__fp16)4.f, _tmp024b, vl), (__fp16)8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, (__fp16)16.f, _tmp024b, vl), (__fp16)2.f, _tmp024c, vl); __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)2.f, _tmp135b, vl), (__fp16)16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)8.f, _tmp135b, vl), (__fp16)4.f, _tmp135c, vl); vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm7, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); @@ -288,16 +288,16 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to vfloat16m1_t _tmp024c = __riscv_vfadd_vv_f16m1(_tmp05, _tmp06, vl); vfloat16m1_t _tmp135c = __riscv_vfsub_vv_f16m1(_tmp05, _tmp06, vl); - vfloat16m1_t _out00 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp00, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl), vl); - vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl), vl); - vfloat16m1_t _out04 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl), vl); + vfloat16m1_t _out00 = __riscv_vfadd_vv_f16m1(_bias0, 
__riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp00, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, (__fp16)32.f, _tmp024c, vl), vl), vl); + vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, (__fp16)4.f, _tmp024b, vl), (__fp16)8.f, _tmp024c, vl), vl); + vfloat16m1_t _out04 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, (__fp16)16.f, _tmp024b, vl), (__fp16)2.f, _tmp024c, vl), vl); __riscv_vse16_v_f16m1(output0, _out00, vl); __riscv_vse16_v_f16m1(output0 + packn * 2, _out02, vl); __riscv_vse16_v_f16m1(output0 + packn * 4, _out04, vl); - vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl), vl); - vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl), vl); - vfloat16m1_t _out05 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp07, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl), vl); + vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)2.f, _tmp135b, vl), (__fp16)16.f, _tmp135c, vl), vl); + vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)8.f, _tmp135b, vl), (__fp16)4.f, _tmp135c, vl), vl); + vfloat16m1_t _out05 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp07, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, (__fp16)32.f, _tmp135b, vl), vl), vl); __riscv_vse16_v_f16m1(output0 + packn, _out01, vl); __riscv_vse16_v_f16m1(output0 + packn * 3, _out03, vl); __riscv_vse16_v_f16m1(output0 + packn * 5, _out05, vl); From 725dcc830c471d3c90fe5abfe3395825683084d2 Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Fri, 23 Jan 2026 14:30:13 +0800 Subject: 
[PATCH 2/7] fix: no converting warning in self-test --- src/layer/riscv/celu_riscv_zfh.cpp | 4 +- src/layer/riscv/convolution1d_riscv_zfh.cpp | 8 ++-- .../riscv/convolution_3x3_pack1ton_fp16s.h | 4 +- .../riscv/convolution_7x7_pack1ton_fp16s.h | 2 +- src/layer/riscv/convolution_pack1ton_fp16s.h | 2 +- src/layer/riscv/convolution_packn_fp16s.h | 2 +- src/layer/riscv/convolution_packnto1_fp16s.h | 4 +- src/layer/riscv/convolution_sgemm.h | 8 ++-- src/layer/riscv/convolution_sgemm_fp16s.h | 8 ++-- .../riscv/convolution_sgemm_pack1ton_fp16s.h | 2 +- .../riscv/convolution_sgemm_packn_fp16s.h | 30 ++++++------ src/layer/riscv/convolution_sgemm_packnto1.h | 2 +- .../riscv/convolution_sgemm_packnto1_fp16s.h | 34 +++++++------- .../convolution_winograd_dot_packn_fp16s.h | 30 ++++++------ .../riscv/convolution_winograd_transform.h | 4 +- ...nvolution_winograd_transform_packn_fp16s.h | 46 +++++++++---------- src/layer/riscv/convolutiondepthwise_3x3.h | 4 +- .../riscv/deconvolution_pack1ton_fp16s.h | 2 +- src/layer/riscv/deconvolution_packn_fp16s.h | 2 +- .../riscv/deconvolution_packnto1_fp16s.h | 4 +- src/layer/riscv/innerproduct_riscv_zfh.cpp | 6 +-- src/layer/riscv/instancenorm_riscv_zfh.cpp | 16 +++---- 22 files changed, 112 insertions(+), 112 deletions(-) diff --git a/src/layer/riscv/celu_riscv_zfh.cpp b/src/layer/riscv/celu_riscv_zfh.cpp index 502eee52deed..c817786e76e9 100644 --- a/src/layer/riscv/celu_riscv_zfh.cpp +++ b/src/layer/riscv/celu_riscv_zfh.cpp @@ -32,11 +32,11 @@ int CELU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c size_t vl = __riscv_vsetvl_e16m8(n); vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - vbool2_t _mask = __riscv_vmfgt_vf_f16m8_b2(_p, 0.f, vl); + vbool2_t _mask = __riscv_vmfgt_vf_f16m8_b2(_p, (__fp16)0.f, vl); vfloat16m8_t _q = __riscv_vfdiv_vf_f16m8(_p, alpha, vl); _q = exp_ps(_q, vl); - _q = __riscv_vfsub_vf_f16m8(_q, 1.f, vl); + _q = __riscv_vfsub_vf_f16m8(_q, (__fp16)1.f, vl); _q = __riscv_vfmul_vf_f16m8(_q, 
alpha, vl); vfloat16m8_t _res = __riscv_vmerge_vvm_f16m8(_q, _p, _mask, vl); diff --git a/src/layer/riscv/convolution1d_riscv_zfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp index aafe723060f3..6727993452ed 100644 --- a/src/layer/riscv/convolution1d_riscv_zfh.cpp +++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp @@ -353,7 +353,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_term) { @@ -400,7 +400,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_term) { @@ -443,14 +443,14 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, c for (int j = 0; j < outw; j++) { - __fp16 sum = 0.f; + __fp16 sum = (__fp16)0.f; if (bias_term) { sum = ((const __fp16*)bias_data_fp16)[p]; } - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* kptr = weight_data_fp16.channel(p); diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index b5648bfd1f00..664ed93e3ec7 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -18,7 +18,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); out0.fill(_bias0); const __fp16* k0 = kernel.channel(p); @@ -296,7 +296,7 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); out0.fill(_bias0); const __fp16* k0 = kernel.channel(p); diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 0d94d59411f5..30ea12cb632b 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -22,7 +22,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); out0.fill(_bias0); for (int q = 0; q < inch; q++) diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index a9450289a55b..771679261253 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -126,7 +126,7 @@ static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_data_ptr) { diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index bb2cab09c006..52ba3c997275 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -134,7 +134,7 @@ static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_data_ptr) { diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index b49c53e20bdb..a14364fcea29 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -140,14 +140,14 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo { for (int j = 0; j < outw; j++) { - __fp16 sum = 0.f; + __fp16 sum = (__fp16)0.f; if (bias_data_ptr) { sum = bias_data_ptr[p]; } - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* kptr = weight_data_fp16.channel(p); diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index 2b7432b29db2..b5ef3204bc1f 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ 
b/src/layer/riscv/convolution_sgemm.h @@ -122,7 +122,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr6 = top_blob.channel(p + 6); float* outptr7 = top_blob.channel(p + 7); - const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -237,7 +237,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr2 = top_blob.channel(p + 2); float* outptr3 = top_blob.channel(p + 3); - const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -321,7 +321,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr0 = top_blob.channel(p); float* outptr1 = top_blob.channel(p + 1); - const float zeros[2] = {0.f, 0.f}; + const float zeros[2] = {0.f, 0.f}; const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -401,7 +401,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { float* outptr0 = top_blob.channel(p); - const float bias0 = bias ? bias[p] : 0.f; + const float bias0 = bias ?
bias[p] : 0.f; int i = 0; #if __riscv_vector diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 3bcb89fb3d54..51cf78040e78 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -109,7 +109,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con __fp16* outptr6 = top_blob.channel(p + 6); __fp16* outptr7 = top_blob.channel(p + 7); - const __fp16 zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const __fp16 zeros[8] = {(__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f}; const __fp16* biasptr = bias ? bias + p : zeros; int i = 0; @@ -224,7 +224,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con __fp16* outptr2 = top_blob.channel(p + 2); __fp16* outptr3 = top_blob.channel(p + 3); - const __fp16 zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const __fp16 zeros[4] = {(__fp16)0.f, (__fp16)0.f, (__fp16)0.f, (__fp16)0.f}; const __fp16* biasptr = bias ? bias + p : zeros; int i = 0; @@ -302,7 +302,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? bias[p] : 0.f; + const __fp16 bias0 = bias ? bias[p] : (__fp16)0.f; int i = 0; for (; i + (packn - 1) < size; i += packn) @@ -352,7 +352,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? bias[p] : 0.f; + const __fp16 bias0 = bias ?
bias[p] : (__fp16)0.f; for (int i = 0; i < size; i++) { diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index 49ddd2081fc9..fdb2ea194688 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -52,7 +52,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn = inch * maxk; // inch always > 0 - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias) { diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index c8fbd53e2747..16a879f4d180 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ -262,14 +262,14 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias) { @@ -324,10 +324,10 
@@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias) { @@ -366,8 +366,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias) { @@ -398,7 +398,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias) { diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 910b19bbbb93..bab7a15c9e9a 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -353,7 +353,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c { float* outptr0 = top_blob.channel(p); - const float bias0 = bias ? bias[p] : 0.f; + const float bias0 = bias ? 
bias[p] : 0.f; int i = 0; for (; i + 7 < size; i += 8) diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index 0423ccfa7d74..c7390dc7364b 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -197,7 +197,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #ifdef __clang__ const __fp16* zeros = _zero_tmp; #else - const __fp16 zeros[packn] = {0.f}; + const __fp16 zeros[packn] = {(__fp16)0.f}; #endif // __clang__ const __fp16* biasptr = bias ? bias + p : zeros; @@ -353,7 +353,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? bias[p] : 0.f; + const __fp16 bias0 = bias ? bias[p] : (__fp16)0.f; int i = 0; for (; i + 7 < size; i += 8) @@ -372,14 +372,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum6 = bias0; __fp16 sum7 = bias0; - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum7 =
__riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -460,10 +460,10 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum2 = bias0; __fp16 sum3 = bias0; - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -518,8 +518,8 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum0 = bias0; __fp16 sum1 = bias0; - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -561,7 +561,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum0 = bias0; - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index fed43f8e67c8..56ac33ab9d72 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -176,14 +176,14 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = 
__riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -226,10 +226,10 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -260,8 +260,8 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { @@ -286,7 +286,7 @@ static void 
convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int j = 0; j < nn; j++) { diff --git a/src/layer/riscv/convolution_winograd_transform.h b/src/layer/riscv/convolution_winograd_transform.h index 521bc88dccfb..b89293bd3e5c 100644 --- a/src/layer/riscv/convolution_winograd_transform.h +++ b/src/layer/riscv/convolution_winograd_transform.h @@ -144,7 +144,7 @@ static void conv3x3s1_winograd43_transform_output_rvv(const Mat& top_blob_tm, Ma const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - float bias0 = biasptr ? biasptr[p] : 0.f; + float bias0 = biasptr ? biasptr[p] : 0.f; float tmp[4][6]; @@ -335,7 +335,7 @@ static void conv3x3s1_winograd23_transform_output_rvv(const Mat& top_blob_tm, Ma const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - float bias0 = biasptr ? biasptr[p] : 0.f; + float bias0 = biasptr ?
biasptr[p] : 0.f; float tmp[2][4]; diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index 1d779a2aa7fc..69423dad5b99 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ -91,7 +91,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_r06, 4.f, __riscv_vfmacc_vf_f16m1(_r02, (__fp16)-1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_r06, (__fp16)4.f, __riscv_vfmacc_vf_f16m1(_r02, (__fp16)-1.25f, _r04, vl), vl); vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)2.f, vl), (__fp16)-2.5f, _r03, vl), (__fp16)0.5f, _r05, vl); vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); @@ -131,7 +131,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot vfloat16m1_t _r0tm1 = __riscv_vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); vfloat16m1_t _r0tm2 = __riscv_vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), (__fp16)-1.25f, _tmp04, vl); + vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp06, (__fp16)0.25f, _tmp02, vl), (__fp16)-1.25f, _tmp04, vl); vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, (__fp16)0.5f, vl), (__fp16)-2.5f, _tmp03, vl), (__fp16)2.f, _tmp05, vl); vfloat16m1_t _r0tm3 = __riscv_vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); @@ -253,7 +253,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)2.f, _tmp135b, vl),
(__fp16)16.f, _tmp135c, vl); vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, (__fp16)8.f, _tmp135b, vl), (__fp16)4.f, _tmp135c, vl); - vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm7, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm7, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, (__fp16)32.f, _tmp135b, vl), vl); __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); __riscv_vse16_v_f16m1(tmp[5][m], _tmp5m, vl); @@ -366,17 +366,17 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(r0 + packn * 5, vl); - vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); - vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); + vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)sq2, vl), (__fp16)-sq2_d2, _r03, vl); + vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, (__fp16)-2.f, _r02, vl); + vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)sq2_d2, vl), (__fp16)-sq2, _r03, vl); + vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, (__fp16)-0.5f, _r02, vl); - vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), (__fp16)-2.5f, _r02, vl); vfloat16m1_t _tmp1m = __riscv_vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); vfloat16m1_t _tmp2m = __riscv_vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); vfloat16m1_t _tmp3m = 
__riscv_vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); vfloat16m1_t _tmp4m = __riscv_vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), -2.5f, _r03, vl); + vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), (__fp16)-2.5f, _r03, vl); __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); @@ -404,17 +404,17 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(tmp[m][4], vl); vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); - vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); + vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)sq2, vl), (__fp16)-sq2_d2, _r03, vl); + vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, (__fp16)-2.f, _r02, vl); + vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, (__fp16)sq2_d2, vl), (__fp16)-sq2, _r03, vl); + vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, (__fp16)-0.5f, _r02, vl); - vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), (__fp16)-2.5f, _r02, vl); vfloat16m1_t _tmp1m = __riscv_vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); vfloat16m1_t _tmp2m = __riscv_vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); vfloat16m1_t _tmp3m = __riscv_vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); vfloat16m1_t _tmp4m = __riscv_vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), 
-2.5f, _r03, vl); + vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), (__fp16)-2.5f, _r03, vl); __riscv_vse16_v_f16m1(r0_tm_0, _tmp0m, vl); __riscv_vse16_v_f16m1(r0_tm_1, _tmp1m, vl); @@ -473,7 +473,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); // NOTE variable length array __fp16 tmp[4][6][packn]; @@ -507,9 +507,9 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to vfloat16m1_t _tmp13b = __riscv_vfsub_vv_f16m1(_r03, _r04, vl); vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); - vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl); - vfloat16m1_t _tmp2m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl); - vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl); + vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, (__fp16)sq2_d2, vl), (__fp16)sq2, _tmp13b, vl); + vfloat16m1_t _tmp2m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, (__fp16)0.5f, vl), (__fp16)2.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, (__fp16)sq2_d4, _tmp13a, vl), (__fp16)sq2_m2, _tmp13b, vl); __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); @@ -539,9 +539,9 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to vfloat16m1_t _tmp13b = __riscv_vfsub_vv_f16m1(_r03, _r04, vl); vfloat16m1_t _out00 = 
__riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl), vl); - vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl), vl); - vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl), vl); - vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl), vl); + vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, (__fp16)sq2_d2, vl), (__fp16)sq2, _tmp13b, vl), vl); + vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, (__fp16)0.5f, vl), (__fp16)2.f, _tmp02b, vl), vl); + vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, (__fp16)sq2_d4, _tmp13a, vl), (__fp16)sq2_m2, _tmp13b, vl), vl); __riscv_vse16_v_f16m1(output0, _out00, vl); __riscv_vse16_v_f16m1(output0 + packn, _out01, vl); @@ -677,7 +677,7 @@ static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? 
__riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); // NOTE variable length array __fp16 tmp[2][4][packn]; diff --git a/src/layer/riscv/convolutiondepthwise_3x3.h b/src/layer/riscv/convolutiondepthwise_3x3.h index 8e44409a7ce8..30945d928dd5 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3.h +++ b/src/layer/riscv/convolutiondepthwise_3x3.h @@ -18,7 +18,7 @@ static void convdw3x3s1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& _k { Mat out = top_blob.channel(g); - const float bias0 = bias ? bias[g] : 0.f; + const float bias0 = bias ? bias[g] : (__fp16)0.f; const float* kernel0 = kernel + g * 9; @@ -133,7 +133,7 @@ static void convdw3x3s2_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& _k { Mat out = top_blob.channel(g); - const float bias0 = bias ? bias[g] : 0.f; + const float bias0 = bias ? bias[g] : (__fp16)0.f; const float* kernel0 = kernel + g * 9; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index 45755f460393..5618aab15423 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -119,7 +119,7 @@ static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_data_ptr) { diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 1695aaa0133f..c99373082446 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -121,7 +121,7 @@ static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_data_ptr) { diff --git 
a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 4ed9f9ff922e..60c1727499d2 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -132,14 +132,14 @@ static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b { for (int j = 0; j < outw; j++) { - __fp16 sum = 0.f; + __fp16 sum = (__fp16)0.f; if (bias_data_ptr) { sum = bias_data_ptr[p]; } - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p * packn; diff --git a/src/layer/riscv/innerproduct_riscv_zfh.cpp b/src/layer/riscv/innerproduct_riscv_zfh.cpp index 79fc0f4cbfe3..6149f6eefeca 100644 --- a/src/layer/riscv/innerproduct_riscv_zfh.cpp +++ b/src/layer/riscv/innerproduct_riscv_zfh.cpp @@ -420,7 +420,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; const __fp16* m = bottom_blob.row(j); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_term) { @@ -457,7 +457,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; const __fp16* m = bottom_blob.row(j); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_term) { @@ -550,7 +550,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co for (int p = 0; p < num_output / out_elempack; p++) { const size_t vl = __riscv_vsetvl_e16m1(packn); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); if (bias_term) { diff --git 
a/src/layer/riscv/instancenorm_riscv_zfh.cpp b/src/layer/riscv/instancenorm_riscv_zfh.cpp index 5cad74b13dfb..22aadcdee5f3 100644 --- a/src/layer/riscv/instancenorm_riscv_zfh.cpp +++ b/src/layer/riscv/instancenorm_riscv_zfh.cpp @@ -202,11 +202,11 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio __fp16* ptr = bottom_top_blob.channel(q); // mean and var - __fp16 sum = 0.f; - __fp16 sqsum = 0.f; + __fp16 sum = (__fp16)0.f; + __fp16 sqsum = (__fp16)0.f; #if __riscv_zvfh && !defined(C906) - vfloat16m1_t _sum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); - vfloat16m1_t _sqsum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); + vfloat16m1_t _sum = __riscv_vfmv_s_f_f16m1((__fp16)0.f, __riscv_vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = __riscv_vfmv_s_f_f16m1((__fp16)0.f, __riscv_vsetvlmax_e32m1()); { int n = size; __fp16* ptr_sum = ptr; @@ -305,8 +305,8 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio for (int q = 0; q < c; q++) { __fp16* ptr = bottom_top_blob.channel(q); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sqsum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _sqsum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); for (int i = 0; i < size; i++) { @@ -331,12 +331,12 @@ int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Optio { vfloat16m1_t _gamma = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2((const float*)gamma_data + q * vl, vl), vl); vfloat16m1_t _beta = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2((const float*)beta_data + q * vl, vl), vl); - _a = __riscv_vfdiv_vv_f16m1(_gamma, __riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, eps, vl), vl), vl); + _a = __riscv_vfdiv_vv_f16m1(_gamma, __riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, (__fp16)eps, vl), vl), vl); _b = __riscv_vfnmsub_vv_f16m1(_a, _mean, _beta, vl); } else { - _a = 
__riscv_vfrdiv_vf_f16m1(__riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + _a = __riscv_vfrdiv_vf_f16m1(__riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, (__fp16)eps, vl), vl), (__fp16)1.f, vl); _b = __riscv_vfmul_vv_f16m1(_a, _mean, vl); _b = __riscv_vfsgnjn_vv_f16m1(_b, _b, vl); } From fc73c6f72ee9e0f8120b657ba22cccf4ebc512ef Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Mon, 26 Jan 2026 16:42:36 +0800 Subject: [PATCH 3/7] fix: Fix the remaining warnings --- src/layer/riscv/bias_riscv_zfh.cpp | 2 +- src/layer/riscv/celu_riscv_zfh.cpp | 4 ++-- src/layer/riscv/convolution1d_riscv_zfh.cpp | 4 ++-- src/layer/riscv/convolution_pack1ton_fp16s.h | 2 +- src/layer/riscv/convolution_packn_fp16s.h | 2 +- src/layer/riscv/convolution_packnto1_fp16s.h | 4 ++-- src/layer/riscv/convolution_sgemm_fp16s.h | 4 ++-- .../riscv/convolution_sgemm_packnto1_fp16s.h | 2 +- src/layer/riscv/gru_riscv_zfh.cpp | 4 ++-- src/layer/riscv/interp_bicubic_packn_fp16s.h | 20 +++++++++---------- src/layer/riscv/interp_bilinear_packn_fp16s.h | 6 +++--- src/layer/riscv/interp_riscv_zfh.cpp | 4 ++-- 12 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/layer/riscv/bias_riscv_zfh.cpp b/src/layer/riscv/bias_riscv_zfh.cpp index 84d3540b0964..bcfd30f92378 100644 --- a/src/layer/riscv/bias_riscv_zfh.cpp +++ b/src/layer/riscv/bias_riscv_zfh.cpp @@ -25,7 +25,7 @@ int Bias_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); - __fp16 bias = bias_data[q]; + __fp16 bias = (__fp16)bias_data[q]; #if __riscv_zvfh int n = size; diff --git a/src/layer/riscv/celu_riscv_zfh.cpp b/src/layer/riscv/celu_riscv_zfh.cpp index c817786e76e9..295b8b2b79d3 100644 --- a/src/layer/riscv/celu_riscv_zfh.cpp +++ b/src/layer/riscv/celu_riscv_zfh.cpp @@ -34,10 +34,10 @@ int CELU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, 
vl); vbool2_t _mask = __riscv_vmfgt_vf_f16m8_b2(_p, (__fp16)0.f, vl); - vfloat16m8_t _q = __riscv_vfdiv_vf_f16m8(_p, alpha, vl); + vfloat16m8_t _q = __riscv_vfdiv_vf_f16m8(_p, (__fp16)alpha, vl); _q = exp_ps(_q, vl); _q = __riscv_vfsub_vf_f16m8(_q, (__fp16)1.f, vl); - _q = __riscv_vfmul_vf_f16m8(_q, alpha, vl); + _q = __riscv_vfmul_vf_f16m8(_q, (__fp16)alpha, vl); vfloat16m8_t _res = __riscv_vmerge_vvm_f16m8(_q, _p, _mask, vl); __riscv_vse16_v_f16m8(ptr, _res, vl); diff --git a/src/layer/riscv/convolution1d_riscv_zfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp index 6727993452ed..403bb0cb675d 100644 --- a/src/layer/riscv/convolution1d_riscv_zfh.cpp +++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp @@ -143,7 +143,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co { float val = (float)*slptr++; vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); - _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w0, vl); kptr += packn; } @@ -186,7 +186,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co { float val = (float)sptr[0]; vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w, vl); sptr += dilation_w; kptr += packn; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index 771679261253..6d34c63abf8c 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -65,7 +65,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob { float val = (float)sptr[space_ofs[k]]; vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, (__fp16)val, _w, vl); kptr += packn; } diff --git a/src/layer/riscv/convolution_packn_fp16s.h 
b/src/layer/riscv/convolution_packn_fp16s.h index 52ba3c997275..91c25e78c3c9 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -71,7 +71,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); // _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl); - vfloat32m2_t _qwq = __riscv_vfwmul_vf_f32m2(_w0, val, vl); + vfloat32m2_t _qwq = __riscv_vfwmul_vf_f32m2(_w0, (__fp16)val, vl); _sum = __riscv_vfadd_vv_f32m2(_sum, _qwq, vl); kptr += packn; diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index a14364fcea29..34e216fe040d 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -85,7 +85,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); #endif - sum = activation_ss(sum, activation_type, activation_params); + sum = (__fp16)activation_ss(sum, activation_type, activation_params); outptr[j] = (__fp16)sum; } @@ -169,7 +169,7 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl)); - sum = activation_ss(sum, activation_type, activation_params); + sum = (__fp16)activation_ss(sum, activation_type, activation_params); outptr[j] = sum; } diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 51cf78040e78..1081d7932ae3 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -302,7 +302,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? 
bias[p] : (__fp16)0.f; + const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f; int i = 0; for (; i + (packn - 1) < size; i += packn) @@ -352,7 +352,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? bias[p] : (__fp16)0.f; + const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f; for (int i = 0; i < size; i++) { diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index c7390dc7364b..fcf3ed34030a 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -353,7 +353,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ { __fp16* outptr0 = top_blob.channel(p); - const __fp16 bias0 = bias ? bias[p] : (__fp16)0.f; + const __fp16 bias0 = bias ? (__fp16)bias[p] : (__fp16)0.f; int i = 0; for (; i + 7 < size; i += 8) diff --git a/src/layer/riscv/gru_riscv_zfh.cpp b/src/layer/riscv/gru_riscv_zfh.cpp index cc3c4fd56a5a..ba2ba24166aa 100644 --- a/src/layer/riscv/gru_riscv_zfh.cpp +++ b/src/layer/riscv/gru_riscv_zfh.cpp @@ -491,8 +491,8 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const // sigmoid(R) // sigmoid(U) - R = 1.f / (1.f + (__fp16)expf((float)(-R))); - U = 1.f / (1.f + (__fp16)expf((float)(-U))); + R = (__fp16)(1.f / (1.f + expf((float)(-R)))); + U = (__fp16)(1.f / (1.f + expf((float)(-U)))); // gate new const __fp16* bias_c_WN = bias_c.row(2); diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index bfa3036fabda..c86d00033113 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -50,7 +50,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = 
__riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); @@ -86,8 +86,8 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); @@ -132,9 +132,9 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); 
vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); @@ -180,10 +180,10 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), 
alphap[3], _S03, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index 3b2fa7446452..7e00c3efb62b 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -43,7 +43,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a 
vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); @@ -70,8 +70,8 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p + packn, vl); vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), alphap[1], _S01, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl); __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); diff --git a/src/layer/riscv/interp_riscv_zfh.cpp b/src/layer/riscv/interp_riscv_zfh.cpp index 6261353ccc34..acef5e4b5247 100644 --- a/src/layer/riscv/interp_riscv_zfh.cpp +++ b/src/layer/riscv/interp_riscv_zfh.cpp @@ -150,7 +150,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S1, vl); __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), 
vl); @@ -189,7 +189,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S1, vl), (__fp16)alphap[2], _S2, vl), (__fp16)alphap[3], _S3, vl); __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); From 5732487e5d200d8e1391f604ef9fe46deb5aefec Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Wed, 28 Jan 2026 19:03:58 +0800 Subject: [PATCH 4/7] Fix: solve compile problem --- src/layer/riscv/convolution_sgemm.h | 8 ++++---- src/layer/riscv/convolution_winograd_transform.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index b5ef3204bc1f..2b7432b29db2 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -122,7 +122,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr6 = top_blob.channel(p + 6); float* outptr7 = top_blob.channel(p + 7); - const float zeros[8] = {(__fp16)0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; const float* biasptr = bias ? 
bias + p : zeros; int i = 0; @@ -237,7 +237,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr2 = top_blob.channel(p + 2); float* outptr3 = top_blob.channel(p + 3); - const float zeros[4] = {(__fp16)0.f, 0.f, 0.f, 0.f}; + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -321,7 +321,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& float* outptr0 = top_blob.channel(p); float* outptr1 = top_blob.channel(p + 1); - const float zeros[2] = {(__fp16)0.f, 0.f}; + const float zeros[2] = {0.f, 0.f}; const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -401,7 +401,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { float* outptr0 = top_blob.channel(p); - const float bias0 = bias ? bias[p] : (__fp16)0.f; + const float bias0 = bias ? bias[p] : 0.f; int i = 0; #if __riscv_vector diff --git a/src/layer/riscv/convolution_winograd_transform.h b/src/layer/riscv/convolution_winograd_transform.h index b89293bd3e5c..521bc88dccfb 100644 --- a/src/layer/riscv/convolution_winograd_transform.h +++ b/src/layer/riscv/convolution_winograd_transform.h @@ -144,7 +144,7 @@ static void conv3x3s1_winograd43_transform_output_rvv(const Mat& top_blob_tm, Ma const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - float bias0 = biasptr ? biasptr[p] : (__fp16)0.f; + float bias0 = biasptr ? biasptr[p] : 0.f; float tmp[4][6]; @@ -335,7 +335,7 @@ static void conv3x3s1_winograd23_transform_output_rvv(const Mat& top_blob_tm, Ma const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - float bias0 = biasptr ? biasptr[p] : (__fp16)0.f; + float bias0 = biasptr ? 
biasptr[p] : 0.f; float tmp[2][4]; From 24a8db3c4d360e5f48778daa2501d7061769f16c Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Wed, 28 Jan 2026 19:41:14 +0800 Subject: [PATCH 5/7] Fix: solve compile problem --- src/layer/riscv/convolution_sgemm_packnto1.h | 2 +- src/layer/riscv/convolutiondepthwise_3x3.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index bab7a15c9e9a..910b19bbbb93 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -353,7 +353,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c { float* outptr0 = top_blob.channel(p); - const float bias0 = bias ? bias[p] : (__fp16)0.f; + const float bias0 = bias ? bias[p] : 0.f; int i = 0; for (; i + 7 < size; i += 8) diff --git a/src/layer/riscv/convolutiondepthwise_3x3.h b/src/layer/riscv/convolutiondepthwise_3x3.h index 30945d928dd5..8e44409a7ce8 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3.h +++ b/src/layer/riscv/convolutiondepthwise_3x3.h @@ -18,7 +18,7 @@ static void convdw3x3s1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& _k { Mat out = top_blob.channel(g); - const float bias0 = bias ? bias[g] : (__fp16)0.f; + const float bias0 = bias ? bias[g] : 0.f; const float* kernel0 = kernel + g * 9; @@ -133,7 +133,7 @@ static void convdw3x3s2_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& _k { Mat out = top_blob.channel(g); - const float bias0 = bias ? bias[g] : (__fp16)0.f; + const float bias0 = bias ? 
bias[g] : 0.f; const float* kernel0 = kernel + g * 9; From e53d0db87752d133b466d910693a362ef64db9b9 Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Mon, 2 Feb 2026 18:47:34 +0800 Subject: [PATCH 6/7] fix remaining issues --- .../riscv/deconvolution_packnto1_fp16s.h | 4 ++-- src/layer/riscv/interp_bicubic_packn_fp16s.h | 20 +++++++++---------- src/layer/riscv/interp_bilinear_packn_fp16s.h | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 60c1727499d2..33b17c422ff1 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -94,7 +94,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl sum = activation_ss(sum, activation_type, activation_params); - outptr[j] = sum; + outptr[j] = (__fp16)sum; } outptr += outw; @@ -185,7 +185,7 @@ static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b sum = activation_ss(sum, activation_type, activation_params); - outptr[j] = sum; + outptr[j] = (__fp16)sum; } outptr += outw; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index c86d00033113..272a4d8ff1dd 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -50,7 +50,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows3 = 
__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S31, vl), (__fp16)alphap[2], _S32, vl), (__fp16)alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); @@ -86,8 +86,8 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S21, vl), (__fp16)alphap[2], _S22, vl), (__fp16)alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S31, vl), (__fp16)alphap[2], _S32, vl), (__fp16)alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); @@ -132,9 +132,9 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, 
(__fp16)alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S11, vl), (__fp16)alphap[2], _S12, vl), (__fp16)alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S21, vl), (__fp16)alphap[2], _S22, vl), (__fp16)alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S31, vl), (__fp16)alphap[2], _S32, vl), (__fp16)alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); @@ -180,10 +180,10 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, 
(__fp16)alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S01, vl), (__fp16)alphap[2], _S02, vl), (__fp16)alphap[3], _S03, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S11, vl), (__fp16)alphap[2], _S12, vl), (__fp16)alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S21, vl), (__fp16)alphap[2], _S22, vl), (__fp16)alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S31, vl), (__fp16)alphap[2], _S32, vl), (__fp16)alphap[3], _S33, vl); __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index 7e00c3efb62b..738f1b71c134 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -43,7 +43,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a vfloat16m1_t _S10 = 
__riscv_vle16_v_f16m1(S1p, vl); vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S11, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); @@ -70,8 +70,8 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p + packn, vl); vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), alphap[1], _S01, vl); - vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), alphap[1], _S11, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S01, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, (__fp16)alphap[0], vl), (__fp16)alphap[1], _S11, vl); __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); From 28579889f138068b37449b91d69f3c1863ba380c Mon Sep 17 00:00:00 2001 From: simoneyuan Date: Tue, 3 Feb 2026 14:23:28 +0800 Subject: [PATCH 7/7] fix remaining issues --- src/layer/riscv/convolution1d_riscv_zfh.cpp | 2 +- src/layer/riscv/deconvolution_packnto1_fp16s.h | 2 +- src/layer/riscv/gru_riscv_zfh.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/convolution1d_riscv_zfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp index 403bb0cb675d..3ba70e1ea2e3 100644 --- a/src/layer/riscv/convolution1d_riscv_zfh.cpp +++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp @@ -471,7 +471,7 @@ int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, 
Mat& top_blob, c sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl)); - sum = activation_ss(sum, activation_type, activation_params); + sum = (__fp16)activation_ss(sum, activation_type, activation_params); outptr[j] = sum; } diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 33b17c422ff1..058dd784c11a 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -183,7 +183,7 @@ static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl)); - sum = activation_ss(sum, activation_type, activation_params); + sum = (__fp16)activation_ss(sum, activation_type, activation_params); outptr[j] = (__fp16)sum; } diff --git a/src/layer/riscv/gru_riscv_zfh.cpp b/src/layer/riscv/gru_riscv_zfh.cpp index ba2ba24166aa..281051dad67b 100644 --- a/src/layer/riscv/gru_riscv_zfh.cpp +++ b/src/layer/riscv/gru_riscv_zfh.cpp @@ -583,7 +583,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = (1 - U) * N + U * hidden_state[q]; hidden_state[q] = H; - output_data[q] = H; + output_data[q] = (__fp16)H; } }