
Extend multimodal/speech_llm with lhotse, t5 and bestow supports #9169

Merged · 470 commits · Jun 7, 2024

Commits
0c7b399
Fixes
pzelasko Dec 8, 2023
3b282aa
Docs fix
pzelasko Dec 8, 2023
5034d77
Add support for custom NeMo fields in Lhotse-NeMo adapters (attach to…
pzelasko Dec 11, 2023
31b1973
Add support for custom NeMo fields in Lhotse-NeMo adapters (attach to…
pzelasko Dec 11, 2023
0880d44
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 11, 2023
30ce202
Merge branch 'feature/lhotse-integration' of https://github.com/pzela…
pzelasko Dec 11, 2023
5f11fdb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 11, 2023
02f0f0a
support distributed_fused_adam
zhehuaichen Dec 11, 2023
378af7c
support distributed_fused_adam
zhehuaichen Dec 13, 2023
35412fb
Add support for sharded NeMo manifest files
pzelasko Dec 13, 2023
1f2acde
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 13, 2023
5b58e69
support megatron_amp_O2
zhehuaichen Dec 13, 2023
37cabcc
Support heterogeneous sampling rates in non tarred NeMo manifests
pzelasko Dec 13, 2023
1270609
migrate to PTL2.0
stevehuang52 Dec 14, 2023
6df13f1
clean up
stevehuang52 Dec 14, 2023
fa0493a
update manifest util
stevehuang52 Dec 14, 2023
22e3bff
Support multiple tokenizer/parser types, aggregate tokenizers, and cu…
pzelasko Dec 15, 2023
60cdea6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 15, 2023
96020e6
fix
pzelasko Dec 15, 2023
949fbbc
Merge branch 'feature/lhotse-integration' of https://github.com/pzela…
pzelasko Dec 15, 2023
5630ad4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 15, 2023
fc13c42
fix
pzelasko Dec 15, 2023
cee170f
Merge branch 'feature/lhotse-integration' of https://github.com/pzela…
pzelasko Dec 15, 2023
6eb16fa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 15, 2023
fa73e72
agg and normal tokenizers actually work
pzelasko Dec 15, 2023
0fe901c
Merge branch 'feature/lhotse-integration' of https://github.com/pzela…
pzelasko Dec 15, 2023
c014c85
Support weights for NeMo tarred manifests
pzelasko Dec 15, 2023
034f55f
Temporarily hardcoded pnc stripping/lowercasing
pzelasko Dec 15, 2023
1526dcb
fix
pzelasko Dec 15, 2023
94fcb1f
Merge branch 'feature/lhotse-integration' into canary
pzelasko Dec 15, 2023
0e589a5
make pnc hack configurable from the config and disabled by default
pzelasko Dec 15, 2023
31e5bf7
fix the hack
pzelasko Dec 15, 2023
2eb320a
migrate to ptl2.1 to support multiple dataloaders
stevehuang52 Dec 15, 2023
8437bdd
support encoder overwrite
zhehuaichen Dec 18, 2023
3fc0db6
update misc
stevehuang52 Dec 19, 2023
4f947ce
fix eval and clean up
stevehuang52 Dec 20, 2023
4b70343
Merge branch 'heh/modular_speechlm_tmp' into modular_speechllm_clean_…
zhehuaichen Dec 20, 2023
63131d0
support add_sep for perception model
zhehuaichen Dec 22, 2023
318f784
fix https://github.com/Lightning-AI/pytorch-lightning/issues/18803
zhehuaichen Dec 22, 2023
59c0d4d
add_bos
zhehuaichen Dec 25, 2023
72cbc94
Transformer decoder with conditioning for canary (#8091)
krishnacpuvvada Dec 27, 2023
dc45efc
Option to limit the number of open streams (#8095)
pzelasko Dec 29, 2023
401efed
audio signal support in multi
zhehuaichen Dec 30, 2023
12487c2
update asr evaluator
stevehuang52 Jan 2, 2024
0535fdd
fix from
zhehuaichen Jan 2, 2024
03255be
transcribe fn for Canary models (#8110)
krishnacpuvvada Jan 2, 2024
6148198
update for evaluation
stevehuang52 Jan 2, 2024
6a237c9
update for eval
stevehuang52 Jan 2, 2024
15d162b
update for evaluation
stevehuang52 Jan 2, 2024
faeaac3
fix bleu
stevehuang52 Jan 2, 2024
9fc3ae5
fix typo
stevehuang52 Jan 3, 2024
1e7cfd6
Add missing audio_filepath validation for Canary (#8119)
pzelasko Jan 3, 2024
8b04025
add default concat_sampling_probabilities
zhehuaichen Jan 3, 2024
abaa3b0
Merge branch 'modular_speechllm_clean_cross_ptn2.1' into canary_speec…
zhehuaichen Jan 3, 2024
0ddedd4
support lhotse dataset in speechllm
zhehuaichen Jan 4, 2024
467fb24
bypass get_iterator_k_split
zhehuaichen Jan 5, 2024
1b169fa
tmp fix
zhehuaichen Jan 5, 2024
5dc8660
try to use fixed batch with megatron
zhehuaichen Jan 5, 2024
c0f5f0c
add batch logging
zhehuaichen Jan 6, 2024
6d6be8a
support unfrozen llm
zhehuaichen Jan 11, 2024
209f752
Create README.md
stevehuang52 Jan 12, 2024
01dd0d6
Update README.md
stevehuang52 Jan 12, 2024
528d1bf
Update README.md
stevehuang52 Jan 12, 2024
d94f9dd
update
stevehuang52 Jan 12, 2024
dbad4ac
rename
stevehuang52 Jan 12, 2024
02e91d3
add llama prompt template
zhehuaichen Jan 12, 2024
73736ad
update and refactor
stevehuang52 Jan 15, 2024
0d5c6da
support sample alpha
zhehuaichen Jan 16, 2024
18b27d1
support lhotse validation set and canary pretrained ckpt with pseudo …
zhehuaichen Jan 17, 2024
c12044a
make sure backward compatibility
zhehuaichen Jan 17, 2024
76be5ce
remove pad
zhehuaichen Jan 18, 2024
efa862a
make sure asr_model is frozen
zhehuaichen Jan 18, 2024
3ca65df
support greedy decoding
zhehuaichen Jan 18, 2024
4e17ced
valid on lhotse
zhehuaichen Jan 18, 2024
a7aeddc
fix multi dataloader in val case for lhotse SALM; add default data
zhehuaichen Jan 19, 2024
4ccc271
remove the bruteforce _keep_special_tokens implementation
zhehuaichen Jan 19, 2024
6817833
decoding_ratio and convert_canary_prompt_to_text support
zhehuaichen Jan 19, 2024
f437770
canary_tokens_augment_ratio
zhehuaichen Jan 19, 2024
bf51ad4
debug
zhehuaichen Jan 19, 2024
1aa6fe8
bug fix
zhehuaichen Jan 19, 2024
a38488d
fix lhotse based eval of llama canary model
zhehuaichen Jan 22, 2024
430c5bf
support some overwrite for eval
zhehuaichen Jan 23, 2024
a7dcafe
support zero shot prompt in training
zhehuaichen Jan 23, 2024
62a0cf0
support cross attention based SALM
zhehuaichen Jan 24, 2024
f17e8fa
Merge branch 'canary_speechllm1' of github.com:zhehuaichen/NeMo into …
zhehuaichen Jan 24, 2024
42d74bf
support cross attention based SALM
zhehuaichen Jan 24, 2024
50571f2
fix for batch train/valid of cross
zhehuaichen Jan 24, 2024
7160027
Merge branch 'canary_speechllm1_cross' of github.com:zhehuaichen/NeMo…
zhehuaichen Jan 24, 2024
2fe48aa
support learnable gate and plotting
zhehuaichen Jan 24, 2024
39545b7
support using pseudo label in prompt rather than cross att
zhehuaichen Jan 25, 2024
2cec2f5
bug fix for perception cfg and context tokens shift
zhehuaichen Jan 25, 2024
fa57fb2
DentityConnectorsAdd
zhehuaichen Jan 25, 2024
666aa44
fix ckpt saving
zhehuaichen Jan 25, 2024
fa8e00e
Support RnnGatedCrossAttention
zhehuaichen Jan 26, 2024
175b66e
add include_ffw and fix _optimizer_param_groups for all unfrozen run
zhehuaichen Jan 29, 2024
dcb5084
support grad acc when using bucket
zhehuaichen Feb 1, 2024
6c0a798
support TransformerCrossAttention
zhehuaichen Feb 2, 2024
e9935dc
support ProjectTransformerCrossAttention
zhehuaichen Feb 4, 2024
45932a5
support ++model.use_am_tokenizer ++model.override_vocab_size ++model.…
zhehuaichen Feb 7, 2024
d7cc642
support question set on val without canary
zhehuaichen Feb 12, 2024
eeaad1f
support load_audio_encoder and wip in optim_param_groups
zhehuaichen Feb 13, 2024
2504a0a
minor fix for audio pretrain model init
zhehuaichen Feb 16, 2024
a9478ef
simplify canary_tokens_augment
zhehuaichen Feb 16, 2024
b754b61
use question in the manifest if it exists
zhehuaichen Feb 21, 2024
c94a632
support dataset weighting for non tar
zhehuaichen Feb 21, 2024
94bd346
Update SpeechLLM code (#8475)
stevehuang52 Feb 21, 2024
8afd277
Update README.md
stevehuang52 Feb 21, 2024
78c1e8e
update speechllm (#8486)
stevehuang52 Feb 22, 2024
2e74cd1
clean up
stevehuang52 Feb 22, 2024
95ee03c
for now bypass asr_model init in perception since that causes issues …
zhehuaichen Feb 23, 2024
5ff28a1
update doc and infer
stevehuang52 Feb 23, 2024
80f7439
https://github.com/NVIDIA/NeMo/pull/8464/files
zhehuaichen Feb 23, 2024
e1e825f
update doc
stevehuang52 Feb 23, 2024
99fb448
update doc
stevehuang52 Feb 23, 2024
446c6d9
update doc
stevehuang52 Feb 23, 2024
3d78dd7
update doc
stevehuang52 Feb 23, 2024
70ed539
add a debug script
zhehuaichen Feb 26, 2024
13f03a2
support text-only training and speech and text joint training
zhehuaichen Feb 29, 2024
c0260c6
always require text only data has question field in the data and use it
zhehuaichen Feb 29, 2024
85ba4f6
support prepend_to_exist_question
zhehuaichen Mar 4, 2024
fc185d5
support random_context_prob
zhehuaichen Mar 5, 2024
3247583
apply random_context_prob for w/ and w/o canary
zhehuaichen Mar 5, 2024
268bb70
guard random context
zhehuaichen Mar 5, 2024
9d69f2e
protect the case where answer is empty
zhehuaichen Mar 6, 2024
b812226
fix for ++model.pretrained_canary_model=$ASR_MODEL
zhehuaichen Mar 16, 2024
ae9adf0
support unfreeze_emb
zhehuaichen Mar 17, 2024
0916850
minor update
stevehuang52 Mar 18, 2024
db542b4
fix import
stevehuang52 Mar 18, 2024
fe7214b
clean up
stevehuang52 Mar 18, 2024
b133332
support t5 + lhotse
zhehuaichen Mar 20, 2024
3f5fd1b
add xattn
zhehuaichen Mar 20, 2024
2291706
CrossAttendModularizedAudioT5Model is WIP and replaced by audio_promp…
zhehuaichen Mar 20, 2024
e6cdebf
support distributed adam
zhehuaichen Mar 20, 2024
916324e
clean up
stevehuang52 Mar 20, 2024
98f86b5
fix pretrained info
stevehuang52 Mar 20, 2024
2876d41
support with_distributed_adam
zhehuaichen Mar 21, 2024
cf6deb2
fix distributed adam
zhehuaichen Mar 21, 2024
b8fc008
add local_batch_size
zhehuaichen Mar 22, 2024
b6ef6a5
support mt5
zhehuaichen Mar 22, 2024
555a007
Merge remote-tracking branch 'origin/main' into heh/modular_speechllm_pr
stevehuang52 Mar 22, 2024
8f524e3
Merge remote-tracking branch 'origin/main' into heh/modular_speechllm_pr
stevehuang52 Mar 22, 2024
619d75d
update dockerfile
stevehuang52 Mar 22, 2024
c0b9d0c
support mt5 and bypass bos_id=-1
zhehuaichen Mar 22, 2024
9a4861b
support configurating legacy_tokenizer for mt5 models
zhehuaichen Mar 23, 2024
c3ca938
update for merging main
stevehuang52 Mar 25, 2024
76db149
fix for merge main
stevehuang52 Mar 25, 2024
f7afea1
Merge remote-tracking branch 'origin/main' into heh/modular_speechllm_pr
stevehuang52 Mar 25, 2024
c99ad43
clean up docs
stevehuang52 Mar 25, 2024
7c9ded7
clean up
stevehuang52 Mar 25, 2024
4c4ac20
clean up
stevehuang52 Mar 25, 2024
afbc212
clean up
stevehuang52 Mar 25, 2024
6bce450
refactor
stevehuang52 Mar 25, 2024
b3f6156
clean up
stevehuang52 Mar 25, 2024
f63b8b8
update
stevehuang52 Mar 25, 2024
9dd72b6
clean up
stevehuang52 Mar 26, 2024
11facc7
fix speechlm test
stevehuang52 Mar 26, 2024
3da8282
update doc
stevehuang52 Mar 26, 2024
179fafd
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 Mar 26, 2024
14c1334
refactor
stevehuang52 Mar 26, 2024
98a0143
refactor
stevehuang52 Mar 27, 2024
7dbe84d
refactor
stevehuang52 Mar 27, 2024
3a039f5
fix multi-layer feat
stevehuang52 Mar 27, 2024
55c9e04
Merge remote-tracking branch 'origin/main' into heh/modular_speechllm_pr
stevehuang52 Mar 27, 2024
073212b
update for webdataset
stevehuang52 Mar 27, 2024
edcf401
support setting dropout and label smoothing
zhehuaichen Mar 28, 2024
d3a04e0
make sure the updated cfg is passed to frozen_model
zhehuaichen Mar 28, 2024
3762632
mv model paths
zhehuaichen Mar 29, 2024
ba86fb9
refactor
stevehuang52 Apr 3, 2024
fdfe7b5
force str to avoid bugs with implicit conversion of str to bool type
stevehuang52 Apr 4, 2024
18b2921
Update examples/multimodal/speech_llm/README.md
stevehuang52 Apr 5, 2024
fef24dc
Update examples/multimodal/speech_llm/README.md
stevehuang52 Apr 5, 2024
c532150
refactor
stevehuang52 Apr 5, 2024
21d4261
Merge branch 'heh/modular_speechllm_pr' of https://github.com/NVIDIA/…
stevehuang52 Apr 5, 2024
c2f6b78
refactor
stevehuang52 Apr 5, 2024
7744144
Merge branch 'canary_speechllm1_cross_t5_pr' into canary_speechllm1_c…
zhehuaichen Apr 5, 2024
647e184
update for saving nemo
stevehuang52 Apr 5, 2024
36df825
update eval and ngc ckpt
stevehuang52 Apr 5, 2024
f6a90d1
Update nemo/collections/multimodal/speech_llm/data/audio_text_qa_data…
stevehuang52 Apr 8, 2024
d73a684
Update nemo/collections/multimodal/speech_llm/modules/common/audio_te…
stevehuang52 Apr 8, 2024
3dea3ce
Update tests/collections/multimodal/test_speechllm_models.py
stevehuang52 Apr 8, 2024
aa4f85b
refactor and remove nlp adapter mixin assert
stevehuang52 Apr 8, 2024
9e10694
Merge branch 'heh/modular_speechllm_pr' of https://github.com/NVIDIA/…
stevehuang52 Apr 8, 2024
360acd4
remove random context augmentation
stevehuang52 Apr 8, 2024
6449924
fix docstring
stevehuang52 Apr 8, 2024
52617f9
add docstring
stevehuang52 Apr 8, 2024
7c78165
minor refactor
stevehuang52 Apr 11, 2024
ed29843
refactor
stevehuang52 Apr 11, 2024
19b3d9f
fixes to be compatible with 24.01
zhehuaichen Apr 12, 2024
5a4be92
refactor and fix missing import
stevehuang52 Apr 12, 2024
03b9e60
fix for unfreeze llm
zhehuaichen Apr 13, 2024
35f0b03
for unfreeze am
zhehuaichen Apr 13, 2024
c991e5b
Merge branch 'main' into heh/modular_speechllm_pr
pablo-garay Apr 13, 2024
79156fc
major refactor on input format and minor update
stevehuang52 Apr 16, 2024
0268898
Merge branch 'heh/modular_speechllm_pr' of https://github.com/NVIDIA/…
stevehuang52 Apr 16, 2024
b6cac3d
fix codeQL
stevehuang52 Apr 17, 2024
8b19dc5
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 Apr 17, 2024
960f958
clean up
stevehuang52 Apr 17, 2024
fac3a4e
fix for canary prompt
zhehuaichen Apr 22, 2024
89f0a42
fix for canary prompt and support t5
zhehuaichen Apr 22, 2024
790359d
Merge branch 'canary_speechllm1_cross_t5_pr2' of github.com:zhehuaich…
zhehuaichen Apr 23, 2024
2e18366
Merge remote-tracking branch 'origin/main' into heh/modular_speechllm_pr
stevehuang52 Apr 24, 2024
2bf9b07
configurable random_context_positive_percent
zhehuaichen Apr 24, 2024
566ee5a
update default random_context_num to 8 to reduce seq len
zhehuaichen Apr 24, 2024
f5e4af3
inference support
zhehuaichen Apr 25, 2024
a63e35d
support TP>1
zhehuaichen May 1, 2024
e0f5189
fix for salm decode
zhehuaichen May 5, 2024
8043262
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 May 6, 2024
55f8231
update for NGC ckpt and refactor
stevehuang52 May 6, 2024
d9e2788
clean up
stevehuang52 May 6, 2024
60843db
support output metainfo with audio_filepath
zhehuaichen May 7, 2024
3cd12e9
Merge branch 'main' into heh/modular_speechllm_pr
ericharper May 7, 2024
30a583a
Merge branch 'main' into heh/modular_speechllm_pr
ericharper May 7, 2024
b9fc1bc
Merge remote-tracking branch 'upstream/heh/modular_speechllm_pr' into…
zhehuaichen May 8, 2024
e4cad0c
revert unrelated changes
zhehuaichen May 8, 2024
226c605
revert unrelated changes
zhehuaichen May 8, 2024
f9e2f94
some fixes for t5
zhehuaichen May 8, 2024
d4a6fd8
clean up and test inference
zhehuaichen May 8, 2024
90887cf
move dataset code to one place
zhehuaichen May 8, 2024
d5265bd
verify train and inference for bestow+gpt and salm+t5
zhehuaichen May 8, 2024
55b270b
Merge branch 'main' into heh/modular_speechllm_pr
nithinraok May 8, 2024
1c4cbd7
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 May 9, 2024
3e88457
skip speechlm test until data moved to CI machines
stevehuang52 May 9, 2024
0700cdb
use pad_id for pad and add eos_id when enabled
zhehuaichen May 9, 2024
17ab55b
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 May 10, 2024
6cae145
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 May 10, 2024
4cfaa30
refactor and update to avoid changing nlp_adapter_mixin
stevehuang52 May 10, 2024
27e33ee
Merge branch 'heh/modular_speechllm_pr' of https://github.com/NVIDIA/…
stevehuang52 May 10, 2024
67ecaa1
Merge branch 'main' into heh/modular_speechllm_pr
stevehuang52 May 10, 2024
89926fa
Apply isort and black reformatting
stevehuang52 May 10, 2024
fb8914d
Merge remote-tracking branch 'upstream/heh/modular_speechllm_pr' into…
zhehuaichen May 11, 2024
9499f2e
minor edit
zhehuaichen May 11, 2024
e601135
Merge remote-tracking branch 'upstream/main' into canary_speechllm1_c…
zhehuaichen May 11, 2024
3cc0432
Apply isort and black reformatting
zhehuaichen May 11, 2024
11407b2
fixes per Piotr and Steve's comments
zhehuaichen May 28, 2024
db2166e
WIP in getting rid of canary specific things in dataset
zhehuaichen Jun 3, 2024
d6c23a5
Merge remote-tracking branch 'upstream/main' into canary_speechllm1_c…
zhehuaichen Jun 3, 2024
9c2c4af
remove canary specific design; bugfix for asr/models/aed_multitask_mo…
zhehuaichen Jun 3, 2024
24c0f9f
remove random_context and submit it later by rewriting with augmenter
zhehuaichen Jun 3, 2024
1999298
remove canary specific stuffs in dataloading; use input_cfg in lhotse…
zhehuaichen Jun 4, 2024
1f10bd7
fix for https://github.com/NVIDIA/NeMo/pull/9169/#pullrequestreview-2…
zhehuaichen Jun 4, 2024
e96da9b
Merge remote-tracking branch 'upstream/main' into canary_speechllm1_c…
zhehuaichen Jun 4, 2024
0aa4179
minor fix
zhehuaichen Jun 5, 2024
24daa2e
make sure NGC inference and fix CodeQL https://github.com/NVIDIA/NeMo…
zhehuaichen Jun 5, 2024
1239b35
Merge branch 'main' into canary_speechllm1_cross_t5_pr3
zhehuaichen Jun 5, 2024
eb7e00d
add back the assert in nlp collection and add a enforce_divisible_bat…
zhehuaichen Jun 6, 2024
e4e7802
nit
zhehuaichen Jun 6, 2024
0b4451b
fixes per Som s comments https://github.com/NVIDIA/NeMo/pull/9169#pul…
zhehuaichen Jun 6, 2024
9d362e5
Merge branch 'main' into canary_speechllm1_cross_t5_pr3
zhehuaichen Jun 6, 2024
f12fa74
nit
zhehuaichen Jun 6, 2024
18bdbe7
fix split_list
zhehuaichen Jun 6, 2024
338 changes: 338 additions & 0 deletions examples/multimodal/speech_llm/conf/modular_audio_t5_config.yaml
@@ -0,0 +1,338 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: megatron_audio_t5_salm_lhotse

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 9999
  max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  limit_train_batches: 1000
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 1.0 # if an int n > 1, runs val every n training steps; if a float in 0.0-1.0, runs val every epoch fraction, e.g. 0.25 runs val every quarter epoch
  gradient_clip_val: 1.0
  accumulate_grad_batches: 1

model_target: nemo.collections.multimodal.speech_llm.models.modular_models_t5.ModularizedAudioT5Model

exp_manager:
  # explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${model.data.validation_ds.metric.name}
    save_top_k: 1
    mode: min
    save_nemo_on_train_end: True
    filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{epoch}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    always_save_nemo: False
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True
    strict: False # Should be False to avoid a runtime error where EarlyStopping says the monitor is unavailable, which sometimes happens with resumed training.


model:
  virtual_prompt_style: 'no-prompts' # make cls happy
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism

  pretrained_audio_model: stt_en_fastconformer_transducer_large
  freeze_llm: True
  freeze_audio_encoder: False
  freeze_modality_adapter: False
  load_audio_encoder: True

  global_batch_size: 128
  micro_batch_size: 4
  language_model_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: False # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: False

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially.
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpointing
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block'; not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: True
  gradient_as_bucket_view: False

  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0

  # use_am_tokenizer: True
  # override_vocab_size: 1024

  lora_tuning:
    kqv_adapter_dim: 128
    kv_adapter_dim: 64
    q_adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: 'xavier' # IGNORED if linear_adapter is used; options: xavier, zero or normal
    row_init_method: 'zero' # IGNORED if linear_adapter is used; options: xavier, zero or normal

  peft:
    peft_scheme: "adapter" # can be either adapter, ia3, or ptuning
    restore_from_path: null

    # Used for adapter peft training
    adapter_tuning:
      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
      adapter_dim: 32
      adapter_dropout: 0.0
      norm_position: 'pre' # This can be set to 'pre' or 'post'; 'pre' is normally what is used.
      column_init_method: 'xavier' # IGNORED if linear_adapter is used; options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used; options: xavier, zero or normal
      norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used; options are ['layernorm', 'mixedfusedlayernorm']

    # Used for p-tuning peft training
    p_tuning:
      virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence
      bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
      embedding_dim: 1024 # the size of the prompt encoder embeddings
      init_std: 0.023

  perception:
    target: nemo.collections.multimodal.speech_llm.modules.perception_modules.AudioPerceptionModule
    use_multi_layer_feat: false

    modality_adapter:
      _target_: nemo.collections.asr.modules.ConformerEncoder
      feat_in: 1024
      feat_out: -1 # you may set it if you need a different output size than the default d_model
      n_layers: 2
      d_model: 512

      # Sub-sampling parameters
      subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding
      subsampling_factor: 8 # must be a power of 2 for striding and vggnet
      subsampling_conv_channels: 256 # set to -1 to make it equal to d_model
      causal_downsampling: false

      # Reduction parameters: can be used to add another subsampling layer at a given position.
      # A 2x reduction will speed up training and inference while keeping similar WER.
      # Adding it at the end gives the best WER, while adding it at the beginning gives the best speedup.
      reduction: null # pooling, striding, or null
      reduction_position: null # encoder block index, or -1 for subsampling at the end of the encoder
      reduction_factor: 1

      # Feed forward module's params
      ff_expansion_factor: 4

      # Multi-headed attention module's params
      self_attention_model: rel_pos # rel_pos or abs_pos
      n_heads: 8 # may need to be lower for smaller d_models
      # [left, right] specifies the number of steps to be seen from the left and right of each step in self-attention
      att_context_size: [-1, -1] # -1 means unlimited context
      att_context_style: regular # regular or chunked_limited
      xscaling: true # scales up the input embeddings by sqrt(d_model)
      untie_biases: true # unties the biases of the TransformerXL layers
      pos_emb_max_len: 5000

      # Convolution module's params
      conv_kernel_size: 9
      conv_norm_type: 'batch_norm' # batch_norm, layer_norm, or groupnormN (N specifies the number of groups)
      # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
      # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
      conv_context_size: null

      ### Regularization
      dropout: 0.1 # the dropout used in most of the Conformer modules
      dropout_pre_encoder: 0.1 # the dropout used before the encoder
      dropout_emb: 0.0 # the dropout used for embeddings
      dropout_att: 0.1 # the dropout for multi-headed attention modules

      # set to non-zero to enable stochastic depth
      stochastic_depth_drop_prob: 0.0
      stochastic_depth_mode: linear # linear or uniform
      stochastic_depth_start_layer: 1

    spec_augment:
      _target_: nemo.collections.asr.modules.SpectrogramAugmentation
      freq_masks: 2 # set to zero to disable it
      time_masks: 10 # set to zero to disable it
      freq_width: 27
      time_width: 0.05

    # the following are read from the pretrained AM:
    # output_dim: null
    # encoder: null
    # preprocessor: null

  data:
    train_ds:
      # Example of how to specify paths to multiple datasets
      # manifest_filepath:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted:
      # {'audio_filepath': 'audio1.wav', 'offset': 0.0, 'duration': 12.3, 'question': 'transcribe this audio', 'answer': 'I have a dream...'}
      # The 'answer' field can also be 'text', and a default 'question' field is added if missing in manifests, so as to work with ASR manifests.
      manifest_filepath: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      shuffle: True
      num_workers: 0
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: True
      # Notably, the data weights are controlled by either bucketing_weights
      # or concat_sampling_probabilities, depending on the dataset type (tarred or
      # non-tarred).
      # See audio_text_qa_dataset.py for details.
      concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
      context_key: 'input'
      label_key: 'output'
      add_eos: True
      # add_eos: False
      add_sep: True
      add_bos: False
      separate_prompt_and_response_with_newline: False
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: "Q: {input}\nA: {output}" # fstring to use for the assistant prompt, e.g. "Q: {input}\nA: {output}"
      # ASR configs
      sample_rate: 16000 # ${model.audio_encoder.preprocessor.sample_rate}
      max_duration: 24 # set for LibriSpeech; you may need to update it for your dataset
      min_duration: 0.1
      # tarred datasets
      is_tarred: false
      tarred_audio_filepaths: null
      shuffle_n: 2048
      # bucketing params
      bucketing_strategy: "fully_randomized"
      bucketing_batch_size: null
      # sample_alpha: 0.1
      use_lhotse: True
      duration_bins: [2,4,6,8,10,12,14,16,18]
      lhotse:
        text_field: "text"
        batch_duration: 80 # 0
        quadratic_duration: 30
        max_open_streams: 50
        num_buckets: 30
        buffer_size: 10000
        shuffle_buffer_size: 10000
        duration_bins: [2.92,3.474,3.924,4.335,4.728,5.11,5.487,5.872,6.288,6.696,7.128,7.62,8.208,8.934,9.883,10.56,11.22,11.88,12.51,13.05,13.59,14.13,14.64,15.17875,15.81,16.54,17.37,18.241,19.18]
        # sample_alpha: 0.1

    validation_ds:
      manifest_filepath: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
      global_batch_size: ${model.global_batch_size}
      micro_batch_size: ${model.micro_batch_size}
      shuffle: False
      num_workers: 0
      pin_memory: True
      max_seq_length: 2048
      min_seq_length: 1
      drop_last: False
      context_key: ${model.data.train_ds.context_key}
      label_key: ${model.data.train_ds.label_key}
      add_eos: ${model.data.train_ds.add_eos}
      add_sep: ${model.data.train_ds.add_sep}
      add_bos: ${model.data.train_ds.add_bos}
      separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline}
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for the assistant prompt, e.g. "Q: {input}\nA: {output}"
      tokens_to_generate: 128
      # ASR configs
      sample_rate: 16000 # ${model.audio_encoder.preprocessor.sample_rate}

      log_every_n_steps: 1
      metric:
        name: "wer" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null

    # make model init happy
    num_workers: 0
    # test_ds:
    #   manifest_filepath: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
    #   names: null # Names of the corresponding datasets used to log metrics.
    #   global_batch_size: ${model.global_batch_size}
    #   micro_batch_size: ${model.micro_batch_size}
    #   shuffle: False
    #   num_workers: 4
    #   pin_memory: True
    #   max_seq_length: 2048
    #   min_seq_length: 1
    #   drop_last: False
    #   context_key: 'input'
    #   label_key: 'output'
    #   add_eos: ${model.data.train_ds.add_eos}
    #   add_sep: ${model.data.train_ds.add_sep}
    #   add_bos: ${model.data.train_ds.add_bos}
    #   separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline}
    #   write_predictions_to_file: False
    #   output_file_path_prefix: null # Prefix of the file to write predictions to.
    #   truncation_field: "context" # Options: ['context', 'answer']
    #   index_mapping_dir: null # Path to a directory to write index mapping files.
    #   prompt_template: ${model.data.train_ds.prompt_template}
    #   # ASR configs
    #   sample_rate: 16000 # ${model.audio_encoder.preprocessor.sample_rate}

    #   metric:
    #     name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
    #     average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy', etc. Refer to torchmetrics for metrics where this is supported.
    #     num_classes: null

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # constant_steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false
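The `train_ds` comments above describe the expected data format: a JSONL manifest with one JSON object per line, where `answer` may instead be stored as `text` and a default `question` is filled in for plain ASR manifests. A minimal sketch of producing and reading such a manifest with only the standard library (the default question string and file name here are illustrative, not taken from NeMo's loader):

```python
import json

def write_manifest(path, entries):
    """Write one JSON object per line (JSONL), as NeMo-style manifests expect."""
    with open(path, "w", encoding="utf-8") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")

def read_manifest(path, default_question="transcribe this audio"):
    """Read a JSONL manifest, mirroring the config comment: 'answer' may be
    stored as 'text', and a default 'question' is added when missing."""
    entries = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            if "answer" not in entry and "text" in entry:
                entry["answer"] = entry["text"]
            entry.setdefault("question", default_question)
            entries.append(entry)
    return entries

rows = [
    {"audio_filepath": "audio1.wav", "offset": 0.0, "duration": 12.3,
     "question": "transcribe this audio", "answer": "I have a dream..."},
    # a plain ASR-manifest row: 'text' instead of 'answer', no 'question'
    {"audio_filepath": "audio2.wav", "duration": 3.1, "text": "hello world"},
]
write_manifest("train_manifest.jsonl", rows)
loaded = read_manifest("train_manifest.jsonl")
print(loaded[1]["answer"])    # hello world
print(loaded[1]["question"])  # transcribe this audio
```

Each element of `manifest_filepath` in the config would point at a file written this way.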
8 changes: 6 additions & 2 deletions examples/multimodal/speech_llm/modular_audio_gpt_train.py
@@ -18,7 +18,7 @@
 from nemo.collections.multimodal.speech_llm.models.modular_models import ModularAudioGPTModel
 from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder
 from nemo.core.config import hydra_runner
-from nemo.utils import logging
+from nemo.utils import logging, model_utils
 from nemo.utils.exp_manager import exp_manager

 mp.set_start_method("spawn", force=True)
@@ -61,7 +61,11 @@ def main(cfg) -> None:
     # update resume from checkpoint found by exp_manager
     logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

-    model = ModularAudioGPTModel.restore_from_pretrained_models(cfg, trainer=trainer)
+    if hasattr(cfg, 'model_target'):
+        imported_cls = model_utils.import_class_by_path(cfg.model_target)
+    else:
+        imported_cls = ModularAudioGPTModel
+    model = imported_cls.restore_from_pretrained_models(cfg, trainer=trainer)

     trainer.fit(model)
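The training-script change above makes the model class configurable: when the YAML sets `model_target`, the class is resolved from its dotted path via `model_utils.import_class_by_path` instead of being hardcoded to `ModularAudioGPTModel`. A minimal sketch of how such a path-based import can work with `importlib` (this is an illustration of the pattern, not NeMo's exact implementation; `collections.OrderedDict` stands in for the model class):

```python
import importlib

def import_class_by_path(path: str):
    """Resolve a class from a dotted path like 'package.module.ClassName'."""
    module_path, _, class_name = path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# e.g. the class named by cfg.model_target in the YAML config above
cls = import_class_by_path("collections.OrderedDict")  # stdlib stand-in
print(cls.__name__)  # OrderedDict
```

This keeps one training script usable for both the GPT and T5 variants: only the config decides which model class is instantiated.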