Skip to content

Commit a8f405b

Browse files
stevehuang52 and github-advanced-security[bot]
authored and committed
fix frame vad (NVIDIA-NeMo#14337)
* fix frame vad

  Signed-off-by: stevehuang52 <heh@nvidia.com>

* clean up

  Signed-off-by: stevehuang52 <heh@nvidia.com>

* clean up

  Signed-off-by: stevehuang52 <heh@nvidia.com>

* add CI test

  Signed-off-by: stevehuang52 <heh@nvidia.com>

* Potential fix for code scanning alert no. 15732: Unused import

  Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
  Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>

---------

Signed-off-by: stevehuang52 <heh@nvidia.com>
Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Signed-off-by: Guyue Huang <guyueh@nvidia.com>
1 parent 851b496 commit a8f405b

File tree

4 files changed

+131
-16
lines changed

4 files changed

+131
-16
lines changed

examples/asr/conf/marblenet/marblenet_3x2x64_20ms.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ model:
3636
max_gain_dbfs: 10.0
3737
noise:
3838
prob: 0.6
39-
manifest_path: /manifests/vad_noise/freesound_nonspeech_train_FL200.json
39+
manifest_path: ???
4040
min_snr_db: 0
4141
max_snr_db: 20
4242
max_gain_db: 300.0
@@ -51,15 +51,15 @@ model:
5151
pin_memory: true
5252
val_loss_idx: 0
5353

54-
test_ds:
55-
manifest_filepath: null
56-
sample_rate: ${model.sample_rate}
57-
labels: ${model.labels}
58-
batch_size: 128
59-
shuffle: False
60-
num_workers: 8
61-
pin_memory: true
62-
test_loss_idx: 0
54+
# test_ds:
55+
# manifest_filepath: null
56+
# sample_rate: ${model.sample_rate}
57+
# labels: ${model.labels}
58+
# batch_size: 128
59+
# shuffle: False
60+
# num_workers: 8
61+
# pin_memory: true
62+
# test_loss_idx: 0
6363

6464
preprocessor:
6565
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor

examples/asr/speech_classification/speech_to_frame_label.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
--config-path=<path to dir of configs e.g. "../conf/marblenet">
2323
--config-name=<name of config without .yaml e.g. "marblenet_3x2x64_20ms"> \
2424
model.train_ds.manifest_filepath="<path to train manifest>" \
25+
model.train_ds.augmentor.noise.manifest_path="<path to noise manifest>" \
2526
model.validation_ds.manifest_filepath=["<path to val manifest>","<path to test manifest>"] \
2627
trainer.devices=2 \
2728
trainer.accelerator="gpu" \

nemo/collections/asr/models/classification_models.py

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,19 +1041,27 @@ def _update_decoder_config(self, labels, cfg):
10411041
OmegaConf.set_struct(cfg, True)
10421042

10431043

1044-
class EncDecFrameClassificationModel(EncDecClassificationModel):
1045-
@property
1046-
def output_types(self) -> Optional[Dict[str, NeuralType]]:
1047-
return {"outputs": NeuralType(('B', 'T', 'C'), LogitsType())}
1044+
class EncDecFrameClassificationModel(_EncDecBaseModel):
1045+
"""
1046+
EncDecFrameClassificationModel is a model that performs classification on each frame of the input audio.
1047+
The default config (i.e., marblenet_3x2x64_20ms.yaml) outputs 20ms frames.
1048+
"""
10481049

10491050
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
10501051
self.num_classes = len(cfg.labels)
10511052
self.eval_loop_cnt = 0
10521053
self.ratio_threshold = cfg.get('ratio_threshold', 0.2)
1054+
if cfg.get("is_regression_task", False):
1055+
raise ValueError("EndDecClassificationModel requires the flag is_regression_task to be set as false")
1056+
10531057
super().__init__(cfg=cfg, trainer=trainer)
10541058
self.decoder.output_types = self.output_types
10551059
self.decoder.output_types_for_export = self.output_types
10561060

1061+
@property
1062+
def output_types(self) -> Optional[Dict[str, NeuralType]]:
1063+
return {"outputs": NeuralType(('B', 'T', 'C'), LogitsType())}
1064+
10571065
@classmethod
10581066
def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
10591067
results = []
@@ -1065,6 +1073,32 @@ def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
10651073
results.append(model)
10661074
return results
10671075

1076+
def _setup_preprocessor(self):
1077+
return EncDecClassificationModel.from_config_dict(self._cfg.preprocessor)
1078+
1079+
def _setup_encoder(self):
1080+
return EncDecClassificationModel.from_config_dict(self._cfg.encoder)
1081+
1082+
def _setup_decoder(self):
1083+
return EncDecClassificationModel.from_config_dict(self._cfg.decoder)
1084+
1085+
def _update_decoder_config(self, labels, cfg):
1086+
"""
1087+
Update the number of classes in the decoder based on labels provided.
1088+
1089+
Args:
1090+
labels: The current labels of the model
1091+
cfg: The config of the decoder which will be updated.
1092+
"""
1093+
OmegaConf.set_struct(cfg, False)
1094+
1095+
if 'params' in cfg:
1096+
cfg.params.num_classes = len(labels)
1097+
else:
1098+
cfg.num_classes = len(labels)
1099+
1100+
OmegaConf.set_struct(cfg, True)
1101+
10681102
def _setup_metrics(self):
10691103
self._accuracy = TopKClassificationAccuracy(dist_sync_on_step=True)
10701104
self._macro_accuracy = Accuracy(num_classes=self.num_classes, average='macro', task="multiclass")
@@ -1226,14 +1260,26 @@ def validation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str =
12261260
self._macro_accuracy.update(preds=metric_logits, target=metric_labels)
12271261
stats = self._macro_accuracy._final_state()
12281262

1229-
return {
1263+
output = {
12301264
f'{tag}_loss': loss_value,
12311265
f'{tag}_correct_counts': correct_counts,
12321266
f'{tag}_total_counts': total_counts,
12331267
f'{tag}_acc_micro': acc,
12341268
f'{tag}_acc_stats': stats,
12351269
}
12361270

1271+
if tag == 'val':
1272+
if isinstance(self.trainer.val_dataloaders, (list, tuple)) and len(self.trainer.val_dataloaders) > 1:
1273+
self.validation_step_outputs[dataloader_idx].append(output)
1274+
else:
1275+
self.validation_step_outputs.append(output)
1276+
else:
1277+
if isinstance(self.trainer.test_dataloaders, (list, tuple)) and len(self.trainer.test_dataloaders) > 1:
1278+
self.test_step_outputs[dataloader_idx].append(output)
1279+
else:
1280+
self.test_step_outputs.append(output)
1281+
return output
1282+
12371283
def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, tag: str = 'val'):
12381284
val_loss_mean = torch.stack([x[f'{tag}_loss'] for x in outputs]).mean()
12391285
correct_counts = torch.stack([x[f'{tag}_correct_counts'] for x in outputs]).sum(axis=0)

tests/collections/asr/test_asr_classification_model.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,15 @@
1313
# limitations under the License.
1414

1515
import copy
16+
import json
1617
import os
1718

19+
import tempfile
20+
21+
import lightning.pytorch as pl
22+
import numpy as np
1823
import pytest
24+
import soundfile as sf
1925
import torch
2026
from omegaconf import DictConfig, ListConfig
2127

@@ -104,12 +110,20 @@ def frame_classification_model():
104110
},
105111
}
106112

113+
optim = {
114+
'name': 'sgd',
115+
'lr': 0.01,
116+
'weight_decay': 0.001,
117+
'momentum': 0.9,
118+
}
119+
107120
modelConfig = DictConfig(
108121
{
109122
'preprocessor': DictConfig(preprocessor),
110123
'encoder': DictConfig(encoder),
111124
'decoder': DictConfig(decoder),
112-
'labels': ListConfig(["dummy_cls_{}".format(i + 1) for i in range(5)]),
125+
'optim': DictConfig(optim),
126+
'labels': ListConfig(["0", "1"]),
113127
}
114128
)
115129
model = EncDecFrameClassificationModel(cfg=modelConfig)
@@ -320,3 +334,57 @@ def test_EncDecClassificationDatasetConfig_for_AudioToMultiSpeechLabelDataset(se
320334
assert signatures_match
321335
assert cls_subset is None
322336
assert dataclass_subset is None
337+
338+
@pytest.mark.unit
339+
def test_frame_classification_model(self, frame_classification_model: EncDecFrameClassificationModel):
340+
with tempfile.TemporaryDirectory() as temp_dir:
341+
# generate random audio
342+
audio = np.random.randn(16000 * 1)
343+
# save the audio
344+
audio_path = os.path.join(temp_dir, "audio.wav")
345+
sf.write(audio_path, audio, 16000)
346+
347+
dummy_labels = "0 0 0 0 1 1 1 1 0 0 0 0"
348+
349+
dummy_sample = {
350+
"audio_filepath": audio_path,
351+
"offset": 0.0,
352+
"duration": 1.0,
353+
"label": dummy_labels,
354+
}
355+
356+
# create a manifest file
357+
manifest_path = os.path.join(temp_dir, "dummy_manifest.json")
358+
with open(manifest_path, "w") as f:
359+
for i in range(4):
360+
f.write(json.dumps(dummy_sample) + "\n")
361+
362+
dataloader_cfg = {
363+
"batch_size": 2,
364+
"manifest_filepath": manifest_path,
365+
"sample_rate": 16000,
366+
"num_workers": 0,
367+
"shuffle": False,
368+
"labels": ["0", "1"],
369+
}
370+
371+
trainer_cfg = {
372+
"max_epochs": 1,
373+
"devices": 1,
374+
"accelerator": "auto",
375+
}
376+
377+
optim = {
378+
'name': 'sgd',
379+
'lr': 0.01,
380+
'weight_decay': 0.001,
381+
'momentum': 0.9,
382+
}
383+
384+
trainer = pl.Trainer(**trainer_cfg)
385+
frame_classification_model.set_trainer(trainer)
386+
frame_classification_model.setup_optimization(DictConfig(optim))
387+
frame_classification_model.setup_training_data(dataloader_cfg)
388+
frame_classification_model.setup_validation_data(dataloader_cfg)
389+
390+
trainer.fit(frame_classification_model)

0 commit comments

Comments (0)