Spaces:
Runtime error
Runtime error
| # This config contains the default values for training a modified ContextNet model with Transducer loss and BPE-based vocabulary. | |
| # In contrast to original ContextNet, the same number of filters is used throughout the model. | |
| # Default learning parameters in this config are set for effective batch size of 1k on 32 GPUs. | |
| # To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. | |
| # It contains the default values for training a ContextNet ASR model, large size (~144M) with Transducer loss and sub-word encoding. | |
| # Architecture and training config: | |
| # Default learning parameters in this config are set for effective batch size of 1K. To train it with smaller effective | |
| # batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. | |
| # Here are the recommended configs for different variants of ContextNet, other parameters are the same as in this config file. | |
| # | |
| # +-------------+---------+------------+ | |
| # | Model | filters | time_masks | | |
| # +=============+=========+============+ | |
| # | Small (14M)| 256 | 2 | | |
| # +-------------+---------+------------+ | |
| # | Medium (40M)| 512 | 5 | | |
| # +-------------+---------+------------+ | |
| # | Large (145M)| 1024 | 10 | | |
| # +-------------+---------+------------+ | |
| name: &name "ContextNet-8x-Stride-RNNT" | |
| model: | |
| sample_rate: 16000 | |
| compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. | |
| train_ds: | |
| manifest_filepath: ??? | |
| sample_rate: ${model.sample_rate} | |
| batch_size: 16 # Can be increased if memory allows or when using smaller model | |
| trim_silence: false | |
| max_duration: 16.7 | |
| shuffle: true | |
| use_start_end_token: false | |
| num_workers: 16 | |
| pin_memory: true | |
| # tarred datasets | |
| is_tarred: false | |
| tarred_audio_filepaths: null | |
| tarred_shard_strategy: "scatter" | |
| shuffle_n: 2048 | |
| # bucketing params | |
| bucketing_strategy: "synced_randomized" | |
| bucketing_batch_size: null | |
| validation_ds: | |
| manifest_filepath: ??? | |
| sample_rate: ${model.sample_rate} | |
| batch_size: 8 | |
| shuffle: false | |
| use_start_end_token: false | |
| num_workers: 16 | |
| pin_memory: true | |
| test_ds: | |
| manifest_filepath: null | |
| sample_rate: ${model.sample_rate} | |
| batch_size: 8 | |
| shuffle: false | |
| use_start_end_token: false | |
| num_workers: 16 | |
| pin_memory: true | |
| model_defaults: | |
| filters: 1024 | |
| repeat: 5 | |
| dropout: 0.1 | |
| separable: true | |
| se: true | |
| se_context_size: -1 | |
| kernel_size_factor: 1.0 | |
| # encoder / decoder / joint values | |
| enc_hidden: 640 | |
| pred_hidden: 640 | |
| joint_hidden: 640 | |
| tokenizer: | |
| dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) | |
| type: ??? # Can be either bpe or wpe | |
| preprocessor: | |
| _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor | |
| sample_rate: ${model.sample_rate} | |
| normalize: "per_feature" | |
| window_size: 0.025 | |
| window_stride: 0.01 | |
| window: "hann" | |
| features: &n_mels 80 | |
| n_fft: 512 | |
| frame_splicing: 1 | |
| dither: 0.00001 | |
| pad_to: 16 | |
| stft_conv: false | |
| spec_augment: | |
| _target_: nemo.collections.asr.modules.SpectrogramAugmentation | |
| freq_masks: 2 # should be kept at 2 | |
| time_masks: 10 # 2 for small, 5 for medium, 10 for large models (see the table at the top of this file). | |
| freq_width: 27 | |
| time_width: 0.05 | |
| encoder: | |
| _target_: nemo.collections.asr.modules.ConvASREncoder | |
| feat_in: *n_mels | |
| activation: swish | |
| conv_mask: true | |
| init_mode: "tds_uniform" | |
| jasper: | |
| - filters: ${model.model_defaults.filters} | |
| repeat: 1 | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: 0.0 | |
| residual: false | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [2] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| stride_last: true | |
| residual_mode: "stride_add" | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [2] # *stride | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| stride_last: true | |
| residual_mode: "stride_add" | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [2] # stride | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| stride_last: true | |
| residual_mode: "stride_add" | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.filters} | |
| repeat: ${model.model_defaults.repeat} | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: ${model.model_defaults.dropout} | |
| residual: true | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| - filters: ${model.model_defaults.enc_hidden} | |
| repeat: 1 | |
| kernel: [5] | |
| stride: [1] | |
| dilation: [1] | |
| dropout: 0.0 | |
| residual: false | |
| separable: ${model.model_defaults.separable} | |
| se: ${model.model_defaults.se} | |
| se_context_size: ${model.model_defaults.se_context_size} | |
| kernel_size_factor: ${model.model_defaults.kernel_size_factor} | |
| decoder: | |
| _target_: nemo.collections.asr.modules.RNNTDecoder | |
| normalization_mode: null # Currently only null is supported for export. | |
| random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf | |
| blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. | |
| prednet: | |
| pred_hidden: ${model.model_defaults.pred_hidden} | |
| pred_rnn_layers: 1 # only 1 layer LSTM networks are exportable. | |
| t_max: null # Maximum possible target seq length used for Chrono Initialization - https://arxiv.org/abs/1804.11188. Disabled by default. | |
| dropout: 0.1 | |
| joint: | |
| _target_: nemo.collections.asr.modules.RNNTJoint | |
| log_softmax: null # sets it according to cpu/gpu device | |
| preserve_memory: false # dramatically slows down training, but might preserve some memory | |
| # Fuses the computation of prediction net + joint net + loss + WER calculation | |
| # to be run on sub-batches of size `fused_batch_size`. | |
| # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. | |
| # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. | |
| # Using small values here will preserve a lot of memory during training, but will make training slower as well. | |
| # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. | |
| # However, to preserve memory, this ratio can be 1:8 or even 1:16. | |
| # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. | |
| fuse_loss_wer: true | |
| fused_batch_size: 4 | |
| jointnet: | |
| joint_hidden: ${model.model_defaults.joint_hidden} | |
| activation: "relu" | |
| dropout: 0.1 | |
| # RNNT decoding strategy | |
| decoding: | |
| strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd, maes. | |
| # greedy strategy config | |
| greedy: | |
| max_symbols: 10 | |
| # beam strategy config | |
| beam: | |
| beam_size: 4 | |
| score_norm: true | |
| return_best_hypothesis: False | |
| softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax | |
| tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0 | |
| alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0 | |
| maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0 | |
| maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0 | |
| maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0 | |
| maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0 | |
| # RNNT loss config | |
| loss: | |
| loss_name: "default" | |
| warprnnt_numba_kwargs: | |
| # FastEmit regularization: https://arxiv.org/abs/2010.11148 | |
| fastemit_lambda: 0.001 # Values can be in range [1e-4, 1e-2]. Generally, 0.001 is good start. | |
| clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. | |
| optim: | |
| name: novograd | |
| lr: 0.05 | |
| # optimizer arguments | |
| betas: [0.9, 0.0] | |
| weight_decay: 0.001 | |
| # scheduler setup | |
| sched: | |
| name: CosineAnnealing | |
| # scheduler config override | |
| warmup_steps: 5000 | |
| warmup_ratio: null | |
| min_lr: 1e-6 | |
| last_epoch: -1 | |
| trainer: | |
| devices: 1 # number of gpus | |
| max_epochs: 100 | |
| max_steps: -1 # computed at runtime if not set | |
| num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` | |
| accelerator: gpu | |
| strategy: ddp | |
| accumulate_grad_batches: 1 | |
| enable_checkpointing: False # Provided by exp_manager | |
| logger: false # Provided by exp_manager | |
| log_every_n_steps: 100 # Interval of logging. | |
| val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations | |
| check_val_every_n_epoch: 1 # RNNT decoding is slower than CTC, so eval takes longer. Increase value to speed up training slightly. | |
| precision: 32 # RNNT requires a lot of memory, so precision 16 is very important. Use very small batch size for precision 32. | |
| gradient_clip_val: 1.0 # Gradient norm clip value | |
| sync_batchnorm: true | |
| benchmark: false # must stay false for models with variable-length speech input, because enabling it slows down training | |
| exp_manager: | |
| exp_dir: null | |
| name: *name | |
| create_tensorboard_logger: true | |
| create_checkpoint_callback: true | |
| checkpoint_callback_params: | |
| monitor: "val_wer" | |
| mode: "min" | |
| save_top_k: 3 | |
| always_save_nemo: true | |
| create_wandb_logger: false | |
| wandb_logger_kwargs: | |
| name: null | |
| project: null | |
| entity: null | |
| resume_if_exists: false | |
| resume_ignore_no_checkpoint: false | |