| | --- |
| | tags: |
| | - espnet |
| | - audio |
| | - classification |
| | datasets: |
| | - nsynth |
| | license: cc-by-4.0 |
| | --- |
| | |
| | ## ESPnet2 CLS model |
| |
|
| | ### `espnet/OpenBEATS-Large-NsynthInstrument` |
| |
|
| | This model was trained by Shikhar Bharadwaj using the nsynth recipe in [espnet](https://github.com/espnet/espnet/). |
| |
|
| | ## CLS config |
| |
|
| | <details><summary>expand</summary> |
| |
|
| | ``` |
| | config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge2/conf/ear_large/nsynth_instrument.yaml |
| | print_config: false |
| | log_level: INFO |
| | drop_last_iter: false |
| | dry_run: false |
| | iterator_type: sequence |
| | valid_iterator_type: null |
| | output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/nsynth_instrument/cls_earlarge2 |
| | ngpu: 1 |
| | seed: 0 |
| | num_workers: 2 |
| | num_att_plot: 0 |
| | dist_backend: nccl |
| | dist_init_method: env:// |
| | dist_world_size: null |
| | dist_rank: null |
| | local_rank: 0 |
| | dist_master_addr: null |
| | dist_master_port: null |
| | dist_launcher: null |
| | multiprocessing_distributed: false |
| | unused_parameters: true |
| | sharded_ddp: false |
| | use_deepspeed: false |
| | deepspeed_config: null |
| | gradient_as_bucket_view: true |
| | ddp_comm_hook: null |
| | cudnn_enabled: true |
| | cudnn_benchmark: false |
| | cudnn_deterministic: true |
| | use_tf32: false |
| | collect_stats: false |
| | write_collected_feats: false |
| | max_epoch: 30 |
| | patience: null |
| | val_scheduler_criterion: |
| | - valid |
| | - loss |
| | early_stopping_criterion: |
| | - valid |
| | - loss |
| | - min |
| | best_model_criterion: |
| | -   - valid |
| |     - acc |
| |     - max |
| | keep_nbest_models: 1 |
| | nbest_averaging_interval: 0 |
| | grad_clip: 1 |
| | grad_clip_type: 2.0 |
| | grad_noise: false |
| | accum_grad: 1 |
| | no_forward_run: false |
| | resume: true |
| | train_dtype: float32 |
| | use_amp: false |
| | log_interval: null |
| | use_matplotlib: true |
| | use_tensorboard: true |
| | create_graph_in_tensorboard: false |
| | use_wandb: true |
| | wandb_project: audioverse |
| | wandb_id: null |
| | wandb_entity: shikhar |
| | wandb_name: nsynth_instrument.earlarge2 |
| | wandb_model_log_interval: -1 |
| | detect_anomaly: false |
| | use_adapter: false |
| | adapter: lora |
| | save_strategy: all |
| | adapter_conf: {} |
| | pretrain_path: null |
| | init_param: [] |
| | ignore_init_mismatch: false |
| | freeze_param: [] |
| | num_iters_per_epoch: null |
| | batch_size: 32 |
| | valid_batch_size: 16 |
| | batch_bins: 1000000 |
| | valid_batch_bins: null |
| | category_sample_size: 10 |
| | train_shape_file: |
| | - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/nsynth_instrument/cls_stats_16k/train/speech_shape |
| | - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/nsynth_instrument/cls_stats_16k/train/label_shape |
| | valid_shape_file: |
| | - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/nsynth_instrument/cls_stats_16k/valid/speech_shape |
| | - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/nsynth_instrument/cls_stats_16k/valid/label_shape |
| | batch_type: folded |
| | valid_batch_type: null |
| | fold_length: |
| | - 480000 |
| | - 600 |
| | sort_in_batch: descending |
| | shuffle_within_batch: false |
| | sort_batch: descending |
| | multiple_iterator: false |
| | utt2weight_file: null |
| | chunk_length: 500 |
| | chunk_shift_ratio: 0.5 |
| | num_cache_chunks: 1024 |
| | chunk_excluded_key_prefixes: [] |
| | chunk_default_fs: null |
| | chunk_max_abs_length: null |
| | chunk_discard_short_samples: true |
| | train_data_path_and_name_and_type: |
| | -   - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/nsynth_instrument/train/wav.scp |
| |     - speech |
| |     - sound |
| | -   - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/nsynth_instrument/train/text |
| |     - label |
| |     - text |
| | valid_data_path_and_name_and_type: |
| | -   - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/nsynth_instrument/valid/wav.scp |
| |     - speech |
| |     - sound |
| | -   - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/nsynth_instrument/valid/text |
| |     - label |
| |     - text |
| | multi_task_dataset: false |
| | allow_variable_data_keys: false |
| | max_cache_size: 0.0 |
| | max_cache_fd: 32 |
| | allow_multi_rates: false |
| | valid_max_cache_size: null |
| | exclude_weight_decay: false |
| | exclude_weight_decay_conf: {} |
| | optim: adamw |
| | optim_conf: |
| |     lr: 3.0e-05 |
| |     weight_decay: 0.01 |
| |     betas: |
| |     - 0.9 |
| |     - 0.98 |
| | scheduler: cosineannealingwarmuprestarts |
| | scheduler_conf: |
| |     first_cycle_steps: 50000 |
| |     warmup_steps: 5000 |
| |     max_lr: 3.0e-05 |
| |     min_lr: 5.0e-06 |
| | lightning_conf: {} |
| | token_list: |
| | - bass |
| | - keyboard |
| | - organ |
| | - mallet |
| | - guitar |
| | - string |
| | - reed |
| | - brass |
| | - vocal |
| | - flute |
| | - synth_lead |
| | - <blank> |
| | - <unk> |
| | text_token_list: null |
| | text_bpemodel: null |
| | init: xavier_normal |
| | input_size: 1 |
| | use_preprocessor: true |
| | frontend: null |
| | frontend_conf: {} |
| | specaug: null |
| | specaug_conf: {} |
| | normalize: null |
| | normalize_conf: {} |
| | preencoder: null |
| | preencoder_conf: {} |
| | encoder: beats |
| | encoder_conf: |
| |     beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt |
| |     beats_config: |
| |         layer_wise_gradient_decay_ratio: 0.3 |
| |         encoder_layerdrop: 0.1 |
| |         dropout: 0.0 |
| |     use_weighted_representation: false |
| |     specaug_config: |
| |         apply_time_warp: true |
| |         apply_freq_mask: false |
| |         apply_time_mask: true |
| |         time_mask_width_ratio_range: |
| |         - 0 |
| |         - 0.06 |
| |         num_time_mask: 1 |
| |     roll_augment: true |
| |     roll_interval: 1 |
| | text_encoder: null |
| | text_encoder_conf: {} |
| | embedding_fusion: null |
| | embedding_fusion_conf: {} |
| | decoder: linear |
| | decoder_conf: {} |
| | model: espnet |
| | model_conf: |
| |     classification_type: multi-class |
| |     lsm_weight: 0.1 |
| | required: |
| | - output_dir |
| | - token_list |
| | version: '202412' |
| | distributed: false |
| | ``` |
| |
|
| | </details> |
| |
|
| | ### Citations |
| |
|
| | ```BibTex |
| | |
| | @article{bharadwaj2025openbeats, |
| | title={OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder}, |
| | author={Bharadwaj, Shikhar and Cornell, Samuele and Choi, Kwanghee and Fukayama, Satoru and Shim, Hye-jin and Deshmukh, Soham and Watanabe, Shinji}, |
| | journal={arXiv preprint arXiv:2507.14129}, |
| | year={2025} |
| | } |
| | |
| | @inproceedings{watanabe2018espnet, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
| | year={2018}, |
| | booktitle={Proceedings of Interspeech}, |
| | pages={2207--2211}, |
| | doi={10.21437/Interspeech.2018-1456}, |
| | url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
| | } |
| | |
| | |
| | |
| | |
| | |
| | |
| | ``` |
| |
|