vad : add logging for weights to compare with the original model

danbev · danbev · commit 2c7888806500 · 2025-04-08T08:53:04.000+02:00
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4484,9 +4484,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
 static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
     WHISPER_LOG_INFO("%s: building encoder layer\n", __func__);
-    // Reshape from the STFT output which is [258, 1, 1, 1] where are complex
-    // number pairs. I think we can ignore the imaginary part and just use the
-    // real part here.
+    // Reshape from the STFT output, which is [258, 1, 1, 1] where the first
+    // dimension holds complex number pairs. I think we can ignore the
+    // imaginary part and just use the real part here.
     struct ggml_tensor * real_part = ggml_view_1d(ctx0, cur, 129, 0);
     struct ggml_tensor * reshaped = ggml_reshape_3d(ctx0, real_part, 1, 129, 1);
 
@@ -5021,11 +5021,39 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
             return nullptr;
         }
 
-        struct ggml_tensor * tensor = model.stft_forward_basis;
-        std::vector<float> read_b(ggml_nbytes(tensor));
-        ggml_backend_tensor_get(tensor, read_b.data(), 0, ggml_nbytes(tensor));
-        for (int i = 0; i < 10; i++) {
-            WHISPER_LOG_INFO("%s: stft_forward_basis[%d]: %f\n", __func__, i, read_b[i]);
+        {
+            // Print as F32
+            struct ggml_tensor * tensor = model.stft_forward_basis;
+            std::vector<float> read_b(ggml_nbytes(tensor));
+            ggml_backend_tensor_get(tensor, read_b.data(), 0, ggml_nbytes(tensor));
+            for (int i = 0; i < 10; i++) {
+                WHISPER_LOG_INFO("%s: stft_forward_basis[%d]: %f\n", __func__, i, read_b[i]);
+            }
+        }
+
+
+        {
+            // Print as F16
+            struct ggml_tensor * tensor = model.encoder_0_weight;
+            std::vector<uint16_t> raw_data(ggml_nbytes(tensor) / sizeof(uint16_t));
+            ggml_backend_tensor_get(tensor, raw_data.data(), 0, ggml_nbytes(tensor));
+
+            // Convert first 10 values from F16 to F32 for display
+            for (int i = 0; i < 10; i++) {
+                float converted_value = ggml_fp16_to_fp32(raw_data[i]);
+                WHISPER_LOG_INFO("%s: model.encoder.0.reparam_conv: [%d]: %f (raw: 0x%04x)\n",
+                            __func__, i, converted_value, raw_data[i]);
+            }
+        }
+
+        {
+            // Print as F32
+            struct ggml_tensor * tensor = model.encoder_0_bias;
+            std::vector<float> read_b(ggml_nbytes(tensor));
+            ggml_backend_tensor_get(tensor, read_b.data(), 0, ggml_nbytes(tensor));
+            for (int i = 0; i < 10; i++) {
+                WHISPER_LOG_INFO("%s: encoder_0_bias: [%d]: %f\n", __func__, i, read_b[i]);
+            }
         }
     }
 
@@ -5122,7 +5150,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     }
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
     for (int i = 0; i < probs.size(); i++) {
-        WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
+        //WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
     }
 
     segments.n_segments = n_frames;