add jitter buffer for voice

ouwou · ouwou · commit 3306edc514a9 · 2024-03-25T00:38:31.000-04:00
diff --git a/README.md b/README.md
@@ -330,9 +330,11 @@ For example, memory_db would be set by adding `memory_db = true` under the line
 
 #### voice
 
-| Setting | Type   | Default                            | Description                                                |
-|---------|--------|------------------------------------|------------------------------------------------------------|
-| `vad`   | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI |
+| Setting                  | Type   | Default                            | Description                                                                     |
+|--------------------------|--------|------------------------------------|---------------------------------------------------------------------------------|
+| `vad`                    | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI                      |
+| `jitter_latency_desired` | int    | 50                                 | Desired/Minimum latency for jitter buffer (in milliseconds)                     |
+| `jitter_latency_maximum` | int    | 200                                | Maximum latency for jitter buffer before frames are discarded (in milliseconds) |
 
 #### windows
 
diff --git a/src/audio/jitterbuffer.hpp b/src/audio/jitterbuffer.hpp
@@ -0,0 +1,82 @@
+#pragma once
+#include <chrono>
+#include <cstdint>
+#include <deque>
+
+// Very simple, non-RTP-based jitter buffer. Samples are kept in the order they are pushed; out-of-order data is not handled.
+template<typename SampleFormat>
+class JitterBuffer {
+public:
+    /*
+     * desired_latency: how many milliseconds of audio must be buffered before samples can be drawn from the buffer
+     * maximum_latency: how many milliseconds of buffered audio are allowed before the oldest samples start to be discarded
+     */
+    JitterBuffer(int desired_latency, int maximum_latency, int channels, int sample_rate)
+        : m_desired_latency(desired_latency)
+        , m_maximum_latency(maximum_latency)
+        , m_channels(channels)
+        , m_sample_rate(sample_rate)
+        , m_last_push(std::chrono::steady_clock::now()) {
+    }
+    // Number of samples currently stored (a count of SampleFormat elements, not milliseconds).
+    [[nodiscard]] size_t Available() const noexcept {
+        return m_samples.size();
+    }
+    // Refreshes the buffering state, then copies the oldest `amount` samples to ptr and removes them. Returns false (nothing copied) while buffering or when fewer than `amount` samples are stored.
+    bool PopSamples(SampleFormat *ptr, size_t amount) {
+        CheckBuffering();
+        if (m_buffering || Available() < amount) return false;
+        std::copy(m_samples.begin(), m_samples.begin() + amount, ptr); // NOTE(review): std::copy is declared in <algorithm>, which this header does not include directly
+        m_samples.erase(m_samples.begin(), m_samples.begin() + amount);
+        return true;
+    }
+    // Appends `amount` samples from ptr, records the push time, then discards the oldest samples if more than m_maximum_latency milliseconds are buffered.
+    void PushSamples(SampleFormat *ptr, size_t amount) {
+        m_samples.insert(m_samples.end(), ptr, ptr + amount);
+        m_last_push = std::chrono::steady_clock::now();
+        const auto buffered = MillisBuffered();
+        if (buffered > m_maximum_latency) { // NOTE(review): size_t vs int comparison; a negative m_maximum_latency would convert to a very large unsigned value
+            const auto overflow_ms = MillisBuffered() - m_maximum_latency;
+            const auto overflow_samples = overflow_ms * m_channels * m_sample_rate / 1000; // NOTE(review): a whole number of frames when m_sample_rate is a multiple of 1000 (as with 48000); other rates may split a frame - confirm
+            m_samples.erase(m_samples.begin(), m_samples.begin() + overflow_samples);
+        }
+    }
+
+private:
+    [[nodiscard]] size_t MillisBuffered() const {
+        return m_samples.size() * 1000 / m_channels / m_sample_rate; // stored sample count (all channels) converted to milliseconds; integer division truncates
+    }
+    // Updates m_buffering: enters buffering when the buffer is empty; leaves it once the desired latency is buffered or when no push has happened for m_maximum_latency milliseconds.
+    void CheckBuffering() {
+        // if we aren't buffering but the buffer is empty, then we should be
+        if (m_samples.empty()) {
+            if (!m_buffering) {
+                m_buffering = true;
+            }
+            return;
+        }
+
+        if (!m_buffering) return;
+
+        // if we have reached the desired latency, we are sufficiently buffered
+        const auto millis_buffered = MillisBuffered();
+        if (millis_buffered >= m_desired_latency) {
+            m_buffering = false;
+        }
+        // if we haven't buffered to the desired latency but the max latency has elapsed since the last push, exit buffering so it doesn't get stuck
+        const auto now = std::chrono::steady_clock::now();
+        const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(now - m_last_push).count();
+        if (millis >= m_maximum_latency) {
+            m_buffering = false;
+        }
+    }
+
+    int m_desired_latency; // milliseconds
+    int m_maximum_latency; // milliseconds
+    int m_channels; // used together with m_sample_rate to convert between sample counts and milliseconds
+    int m_sample_rate; // samples per second per channel, as implied by the millisecond conversions above
+    bool m_buffering = true; // while true, PopSamples returns false
+    std::chrono::time_point<std::chrono::steady_clock> m_last_push; // time of the most recent PushSamples call (construction time until the first push)
+
+    std::deque<SampleFormat> m_samples; // oldest sample at the front, newest at the back
+};
diff --git a/src/audio/manager.cpp b/src/audio/manager.cpp
@@ -25,6 +25,7 @@ const uint8_t *StripRTPExtensionHeader(const uint8_t *buf, int num_bytes, size_t
     return buf;
 }
 
+// frameCount is configured to be 480 samples per channel
 void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uint32 frameCount) {
     AudioManager *mgr = reinterpret_cast<AudioManager *>(pDevice->pUserData);
     if (mgr == nullptr) return;
@@ -36,12 +37,14 @@ void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uin
         if (const auto vol_it = mgr->m_volume_ssrc.find(ssrc); vol_it != mgr->m_volume_ssrc.end()) {
             volume = vol_it->second;
         }
-        auto &buf = pair.first;
-        const size_t n = std::min(static_cast<size_t>(buf.size()), static_cast<size_t>(frameCount * 2ULL));
-        for (size_t i = 0; i < n; i++) {
+
+        static std::array<int16_t, 480 * 2> buf;
+
+        if (!pair.first.PopSamples(buf.data(), 480 * 2)) continue;
+
+        for (size_t i = 0; i < 480 * 2; i++) {
             pOutputF32[i] += volume * buf[i] / 32768.F;
         }
-        buf.erase(buf.begin(), buf.begin() + n);
     }
 }
 
@@ -201,7 +204,14 @@ void AudioManager::AddSSRC(uint32_t ssrc) {
     int error;
     if (m_sources.find(ssrc) == m_sources.end()) {
         auto *decoder = opus_decoder_create(48000, 2, &error);
-        m_sources.insert(std::make_pair(ssrc, std::make_pair(std::deque<int16_t> {}, decoder)));
+        auto &s = Abaddon::Get().GetSettings();
+        m_sources.insert(std::make_pair(ssrc, std::make_pair(
+                                                  JitterBuffer<int16_t>(
+                                                      s.JitterDesiredLatency,
+                                                      s.JitterMaximumLatency,
+                                                      2,
+                                                      48000),
+                                                  decoder)));
     }
 }
 
@@ -241,7 +251,7 @@ void AudioManager::FeedMeOpus(uint32_t ssrc, const std::vector<uint8_t> &data) {
         } else {
             UpdateReceiveVolume(ssrc, pcm.data(), decoded);
             auto &buf = it->second.first;
-            buf.insert(buf.end(), pcm.begin(), pcm.begin() + decoded * 2);
+            buf.PushSamples(pcm.data(), decoded * 2);
         }
     }
 }
diff --git a/src/audio/manager.hpp b/src/audio/manager.hpp
@@ -21,6 +21,7 @@
 #endif
 
 #include "devices.hpp"
+#include "jitterbuffer.hpp"
 // clang-format on
 
 class AudioManager {
@@ -136,7 +137,7 @@ class AudioManager {
     mutable std::mutex m_rnn_mutex;
 #endif
 
-    std::unordered_map<uint32_t, std::pair<std::deque<int16_t>, OpusDecoder *>> m_sources;
+    std::unordered_map<uint32_t, std::pair<JitterBuffer<int16_t>, OpusDecoder *>> m_sources;
 
     OpusEncoder *m_encoder;
 
diff --git a/src/settings.cpp b/src/settings.cpp
@@ -130,6 +130,8 @@ void SettingsManager::DefineSettings() {
     AddSetting("voice", "vad", "gate"s, &Settings::VAD);
 #endif
     AddSetting("voice", "backends", ""s, &Settings::Backends);
+    AddSetting("voice", "jitter_latency_desired", 50, &Settings::JitterDesiredLatency);
+    AddSetting("voice", "jitter_latency_maximum", 200, &Settings::JitterMaximumLatency);
 }
 
 void SettingsManager::ReadSettings() {
diff --git a/src/settings.hpp b/src/settings.hpp
@@ -52,6 +52,8 @@ class SettingsManager {
         // [voice]
         std::string VAD;
         std::string Backends;
+        int JitterDesiredLatency;
+        int JitterMaximumLatency;
 
         // [windows]
         bool HideConsole;

Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ const uint8_t *StripRTPExtensionHeader(const uint8_t *buf, int num_bytes, size_t`
`25`	`25`	`return buf;`
`26`	`26`	`}`
`27`	`27`
	`28`	`+// frameCount is configured to be 480 samples per channel`
`28`	`29`	`void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uint32 frameCount) {`
`29`	`30`	`AudioManager *mgr = reinterpret_cast<AudioManager *>(pDevice->pUserData);`
`30`	`31`	`if (mgr == nullptr) return;`
`@@ -36,12 +37,14 @@ void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uin`
`36`	`37`	`if (const auto vol_it = mgr->m_volume_ssrc.find(ssrc); vol_it != mgr->m_volume_ssrc.end()) {`
`37`	`38`	`volume = vol_it->second;`
`38`	`39`	`}`
`39`		`- auto &buf = pair.first;`
`40`		`- const size_t n = std::min(static_cast<size_t>(buf.size()), static_cast<size_t>(frameCount * 2ULL));`
`41`		`- for (size_t i = 0; i < n; i++) {`
	`40`	`+`
	`41`	`+ static std::array<int16_t, 480 * 2> buf;`
	`42`	`+`
	`43`	`+ if (!pair.first.PopSamples(buf.data(), 480 * 2)) continue;`
	`44`	`+`
	`45`	`+ for (size_t i = 0; i < 480 * 2; i++) {`
`42`	`46`	`pOutputF32[i] += volume * buf[i] / 32768.F;`
`43`	`47`	`}`
`44`		`- buf.erase(buf.begin(), buf.begin() + n);`
`45`	`48`	`}`
`46`	`49`	`}`
`47`	`50`
`@@ -201,7 +204,14 @@ void AudioManager::AddSSRC(uint32_t ssrc) {`
`201`	`204`	`int error;`
`202`	`205`	`if (m_sources.find(ssrc) == m_sources.end()) {`
`203`	`206`	`auto *decoder = opus_decoder_create(48000, 2, &error);`
`204`		`- m_sources.insert(std::make_pair(ssrc, std::make_pair(std::deque<int16_t> {}, decoder)));`
	`207`	`+ auto &s = Abaddon::Get().GetSettings();`
	`208`	`+ m_sources.insert(std::make_pair(ssrc, std::make_pair(`
	`209`	`+ JitterBuffer<int16_t>(`
	`210`	`+ s.JitterDesiredLatency,`
	`211`	`+ s.JitterMaximumLatency,`
	`212`	`+ 2,`
	`213`	`+ 48000),`
	`214`	`+ decoder)));`
`205`	`215`	`}`
`206`	`216`	`}`
`207`	`217`
`@@ -241,7 +251,7 @@ void AudioManager::FeedMeOpus(uint32_t ssrc, const std::vector<uint8_t> &data) {`
`241`	`251`	`} else {`
`242`	`252`	`UpdateReceiveVolume(ssrc, pcm.data(), decoded);`
`243`	`253`	`auto &buf = it->second.first;`
`244`		`- buf.insert(buf.end(), pcm.begin(), pcm.begin() + decoded * 2);`
	`254`	`+ buf.PushSamples(pcm.data(), decoded * 2);`
`245`	`255`	`}`
`246`	`256`	`}`
`247`	`257`	`}`
Original file line number	Diff line number	Diff line change
`@@ -130,6 +130,8 @@ void SettingsManager::DefineSettings() {`
`130`	`130`	`AddSetting("voice", "vad", "gate"s, &Settings::VAD);`
`131`	`131`	`#endif`
`132`	`132`	`AddSetting("voice", "backends", ""s, &Settings::Backends);`
	`133`	`+ AddSetting("voice", "jitter_latency_desired", 50, &Settings::JitterDesiredLatency);`
	`134`	`+ AddSetting("voice", "jitter_latency_maximum", 200, &Settings::JitterMaximumLatency);`
`133`	`135`	`}`
`134`	`136`
`135`	`137`	`void SettingsManager::ReadSettings() {`