-
Notifications
You must be signed in to change notification settings - Fork 31.7k
Add dithering to the Speech2TextFeatureExtractor API.
#34638
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
696c984
9d9382b
b9a39a7
c2087ea
2422fdb
042699d
8658a70
dda4695
c182442
2fd31df
7264638
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -390,6 +390,7 @@ def spectrogram( | |||||
| center: bool = True, | ||||||
| pad_mode: str = "reflect", | ||||||
| onesided: bool = True, | ||||||
| dither: float = 0.0, | ||||||
| preemphasis: Optional[float] = None, | ||||||
| mel_filters: Optional[np.ndarray] = None, | ||||||
| mel_floor: float = 1e-10, | ||||||
|
|
@@ -460,6 +461,10 @@ def spectrogram( | |||||
| onesided (`bool`, *optional*, defaults to `True`): | ||||||
| If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` | ||||||
| frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. | ||||||
| dither (`float`, *optional*, defaults to 0.0): | ||||||
| Adds dithering. In other words, adds a small amount of Gaussian noise to each frame. | ||||||
| E.g. use 4.0 to add dithering with a normal distribution centered | ||||||
| around 0.0 with standard deviation 4.0; 0.0 means no dithering. | ||||||
| preemphasis (`float`, *optional*): | ||||||
| Coefficient for a first-order high-pass filter that applies pre-emphasis before the DFT. | ||||||
| mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): | ||||||
|
|
@@ -540,6 +545,9 @@ def spectrogram( | |||||
| for frame_idx in range(num_frames): | ||||||
| buffer[:frame_length] = waveform[timestep : timestep + frame_length] | ||||||
|
|
||||||
| if dither != 0.0: | ||||||
| buffer[:frame_length] += dither * np.random.randn(*buffer[:frame_length].shape) | ||||||
|
||||||
| buffer[:frame_length] += dither * np.random.randn(*buffer[:frame_length].shape) | |
| buffer[:frame_length] += dither * np.random.randn(frame_length) |
Or maybe `len(buffer)`?
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| dither (`float`): | |
| dither (`float`, *optional*, defaults to 0.0): |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,7 +53,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): | |
| @require_torch | ||
| @require_torchaudio | ||
| # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap | ||
| class ClapFeatureExtractionTester(unittest.TestCase): | ||
|
||
| class ClapFeatureExtractionTester: | ||
| def __init__( | ||
| self, | ||
| parent, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,7 +48,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): | |
|
|
||
| @require_torch | ||
| @require_torchaudio | ||
| class Speech2TextFeatureExtractionTester(unittest.TestCase): | ||
| class Speech2TextFeatureExtractionTester: | ||
| def __init__( | ||
| self, | ||
| parent, | ||
|
|
@@ -144,6 +144,38 @@ def test_call(self): | |
| for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): | ||
| self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) | ||
|
|
||
| def test_dither(self): | ||
| # Tests that features with and without little dithering are similar, but not the same | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's set the seed here, to ensure reproducibility. |
||
| dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict() | ||
| dict_no_dither["dither"] = 0.0 | ||
|
|
||
| dict_dither = self.feat_extract_tester.prepare_feat_extract_dict() | ||
| dict_dither["dither"] = 1.0 | ||
|
|
||
| feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither) | ||
| feature_extractor_dither = self.feature_extraction_class(**dict_dither) | ||
|
|
||
| # create three inputs of length 800, 1000, and 1200 | ||
| speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] | ||
| np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] | ||
|
|
||
| # compute features | ||
| input_features_no_dither = feature_extractor_no_dither( | ||
| np_speech_inputs, padding=True, return_tensors="np" | ||
| ).input_features | ||
| input_features_dither = feature_extractor_dither( | ||
| np_speech_inputs, padding=True, return_tensors="np" | ||
| ).input_features | ||
|
|
||
| # test there is a difference between features (there's added noise to input signal) | ||
| diff = input_features_dither - input_features_no_dither | ||
|
|
||
| # features are not identical | ||
| self.assertTrue(np.abs(diff).mean() > 1e-5) | ||
| # features are not too different | ||
| self.assertTrue(np.abs(diff).mean() <= 1e-3) | ||
| self.assertTrue(np.abs(diff).max() <= 1e-2) | ||
|
Comment on lines
+178
to
+179
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great! |
||
|
|
||
| def test_cepstral_mean_and_variance_normalization(self): | ||
| feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) | ||
| speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's add the comment about "this can help for hard audio in ASR" 😉
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, added the explanatory comments, thanks!