Skip to content

Commit 3306edc

Browse files
committed
add jitter buffer for voice
1 parent aeebed7 commit 3306edc

File tree

6 files changed

+109
-10
lines changed

6 files changed

+109
-10
lines changed

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,11 @@ For example, memory_db would be set by adding `memory_db = true` under the line
330330
331331
#### voice
332332
333-
| Setting | Type | Default | Description |
334-
|---------|--------|------------------------------------|------------------------------------------------------------|
335-
| `vad` | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI |
333+
| Setting | Type | Default | Description |
334+
|--------------------------|--------|------------------------------------|---------------------------------------------------------------------------------|
335+
| `vad` | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI |
336+
| `jitter_latency_desired` | int | 50 | Desired/Minimum latency for jitter buffer (in milliseconds) |
337+
| `jitter_latency_maximum` | int | 200 | Maximum latency for jitter buffer before frames are discarded (in milliseconds) |
336338
337339
#### windows
338340

src/audio/jitterbuffer.hpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#pragma once
2+
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <deque>
5+
6+
// Very simple non-RTP-based jitter buffer. Does not handle out-of-order packets.
7+
template<typename SampleFormat>
class JitterBuffer {
public:
    /*
     * desired_latency: how many milliseconds must be buffered before audio can be drawn from the buffer
     * maximum_latency: how many milliseconds may be buffered before the oldest audio starts to be discarded
     * channels:        number of interleaved channels in the sample stream
     * sample_rate:     frames per second (per channel)
     *
     * Arguments are sanitized rather than rejected, since the latencies come from user settings:
     *  - negative latencies are treated as 0
     *  - maximum_latency is raised to desired_latency if it is smaller; otherwise every push
     *    would trim the buffer below the playback threshold and audio would never start
     *  - channels and sample_rate are raised to 1 so the millisecond math never divides by zero
     *
     * NOTE: this class does no locking of its own. Callers presumably serialize
     * PushSamples/PopSamples externally - confirm against the owning audio manager.
     */
    JitterBuffer(int desired_latency, int maximum_latency, int channels, int sample_rate)
        : m_desired_latency(static_cast<std::size_t>(std::max(desired_latency, 0)))
        , m_maximum_latency(std::max(static_cast<std::size_t>(std::max(maximum_latency, 0)), m_desired_latency))
        , m_channels(static_cast<std::size_t>(std::max(channels, 1)))
        , m_sample_rate(static_cast<std::size_t>(std::max(sample_rate, 1)))
        , m_last_push(std::chrono::steady_clock::now()) {
    }

    // Number of individual samples (not frames) currently stored.
    [[nodiscard]] std::size_t Available() const noexcept {
        return m_samples.size();
    }

    /*
     * Moves the `amount` oldest samples into `ptr` and removes them from the buffer.
     * Returns false, leaving `ptr` and the buffer untouched, while the buffer is still
     * (re)buffering or when fewer than `amount` samples are stored.
     */
    bool PopSamples(SampleFormat *ptr, std::size_t amount) {
        CheckBuffering();
        if (m_buffering || Available() < amount) return false;

        const auto first = m_samples.begin();
        const auto last = first + static_cast<Offset>(amount);
        std::copy(first, last, ptr);
        m_samples.erase(first, last);
        return true;
    }

    /*
     * Appends `amount` samples read from `ptr`. If more than the maximum latency is buffered
     * afterwards, the oldest audio is dropped until the buffer is back at the maximum.
     */
    void PushSamples(const SampleFormat *ptr, std::size_t amount) {
        m_samples.insert(m_samples.end(), ptr, ptr + amount);
        m_last_push = std::chrono::steady_clock::now();

        const auto buffered = MillisBuffered();
        if (buffered <= m_maximum_latency) return;

        // Drop whole frames only: removing a partial frame would shift the interleaving
        // and swap channels for everything that follows.
        const auto overflow_ms = buffered - m_maximum_latency;
        const auto overflow_frames = overflow_ms * m_sample_rate / 1000;
        const auto overflow_samples = std::min(overflow_frames * m_channels, m_samples.size());
        m_samples.erase(m_samples.begin(), m_samples.begin() + static_cast<Offset>(overflow_samples));
    }

private:
    using Offset = typename std::deque<SampleFormat>::difference_type;

    // Whole milliseconds of audio currently stored (rounded down).
    [[nodiscard]] std::size_t MillisBuffered() const {
        return m_samples.size() * 1000 / m_channels / m_sample_rate;
    }

    // Updates m_buffering; called before every pop.
    void CheckBuffering() {
        // an empty buffer always means we have to (re)buffer
        if (m_samples.empty()) {
            m_buffering = true;
            return;
        }

        if (!m_buffering) return;

        // if we reached desired latency, we are sufficiently buffered
        if (MillisBuffered() >= m_desired_latency) {
            m_buffering = false;
            return;
        }

        // if we havent buffered to desired latency but max latency has elapsed since the
        // last push, exit buffering so it doesnt get stuck holding a short tail of audio
        const auto since_last_push = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - m_last_push);
        const std::chrono::milliseconds limit(static_cast<std::chrono::milliseconds::rep>(m_maximum_latency));
        if (since_last_push >= limit) {
            m_buffering = false;
        }
    }

    std::size_t m_desired_latency; // milliseconds, >= 0
    std::size_t m_maximum_latency; // milliseconds, >= m_desired_latency
    std::size_t m_channels;        // >= 1
    std::size_t m_sample_rate;     // frames per second, >= 1
    bool m_buffering = true;
    std::chrono::time_point<std::chrono::steady_clock> m_last_push;

    std::deque<SampleFormat> m_samples;
};

src/audio/manager.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const uint8_t *StripRTPExtensionHeader(const uint8_t *buf, int num_bytes, size_t
2525
return buf;
2626
}
2727

28+
// frameCount is configured to be 480 samples per channel
2829
void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uint32 frameCount) {
2930
AudioManager *mgr = reinterpret_cast<AudioManager *>(pDevice->pUserData);
3031
if (mgr == nullptr) return;
@@ -36,12 +37,14 @@ void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uin
3637
if (const auto vol_it = mgr->m_volume_ssrc.find(ssrc); vol_it != mgr->m_volume_ssrc.end()) {
3738
volume = vol_it->second;
3839
}
39-
auto &buf = pair.first;
40-
const size_t n = std::min(static_cast<size_t>(buf.size()), static_cast<size_t>(frameCount * 2ULL));
41-
for (size_t i = 0; i < n; i++) {
40+
41+
static std::array<int16_t, 480 * 2> buf;
42+
43+
if (!pair.first.PopSamples(buf.data(), 480 * 2)) continue;
44+
45+
for (size_t i = 0; i < 480 * 2; i++) {
4246
pOutputF32[i] += volume * buf[i] / 32768.F;
4347
}
44-
buf.erase(buf.begin(), buf.begin() + n);
4548
}
4649
}
4750

@@ -201,7 +204,14 @@ void AudioManager::AddSSRC(uint32_t ssrc) {
201204
int error;
202205
if (m_sources.find(ssrc) == m_sources.end()) {
203206
auto *decoder = opus_decoder_create(48000, 2, &error);
204-
m_sources.insert(std::make_pair(ssrc, std::make_pair(std::deque<int16_t> {}, decoder)));
207+
auto &s = Abaddon::Get().GetSettings();
208+
m_sources.insert(std::make_pair(ssrc, std::make_pair(
209+
JitterBuffer<int16_t>(
210+
s.JitterDesiredLatency,
211+
s.JitterMaximumLatency,
212+
2,
213+
48000),
214+
decoder)));
205215
}
206216
}
207217

@@ -241,7 +251,7 @@ void AudioManager::FeedMeOpus(uint32_t ssrc, const std::vector<uint8_t> &data) {
241251
} else {
242252
UpdateReceiveVolume(ssrc, pcm.data(), decoded);
243253
auto &buf = it->second.first;
244-
buf.insert(buf.end(), pcm.begin(), pcm.begin() + decoded * 2);
254+
buf.PushSamples(pcm.data(), decoded * 2);
245255
}
246256
}
247257
}

src/audio/manager.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#endif
2222

2323
#include "devices.hpp"
24+
#include "jitterbuffer.hpp"
2425
// clang-format on
2526

2627
class AudioManager {
@@ -136,7 +137,7 @@ class AudioManager {
136137
mutable std::mutex m_rnn_mutex;
137138
#endif
138139

139-
std::unordered_map<uint32_t, std::pair<std::deque<int16_t>, OpusDecoder *>> m_sources;
140+
std::unordered_map<uint32_t, std::pair<JitterBuffer<int16_t>, OpusDecoder *>> m_sources;
140141

141142
OpusEncoder *m_encoder;
142143

src/settings.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ void SettingsManager::DefineSettings() {
130130
AddSetting("voice", "vad", "gate"s, &Settings::VAD);
131131
#endif
132132
AddSetting("voice", "backends", ""s, &Settings::Backends);
133+
AddSetting("voice", "jitter_latency_desired", 50, &Settings::JitterDesiredLatency);
134+
AddSetting("voice", "jitter_latency_maximum", 200, &Settings::JitterMaximumLatency);
133135
}
134136

135137
void SettingsManager::ReadSettings() {

src/settings.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ class SettingsManager {
5252
// [voice]
5353
std::string VAD;
5454
std::string Backends;
55+
int JitterDesiredLatency;
56+
int JitterMaximumLatency;
5557

5658
// [windows]
5759
bool HideConsole;

0 commit comments

Comments
 (0)