Change CI from using cspell to codespell (speechbrain#2467)
pplantinga committed Mar 21, 2024
1 parent 35ef78c commit 341e35c
Showing 115 changed files with 208 additions and 235 deletions.
24 changes: 22 additions & 2 deletions .dict-speechbrain.txt
@@ -152,6 +152,7 @@ vWeight
wDay
wGap
whamr
whats
Wmax
xAxis
xHat
@@ -191,6 +192,7 @@ aiox
alffa
alived
annot
ans
arpa
arpack
arxiv
@@ -212,6 +214,7 @@ bmlh
brir
cafile
cand
cant
catl
catr
cbak
@@ -222,6 +225,7 @@ cfgs
chans
cheby
cheybyshev
childs
chkarada
chnl
chnls
@@ -247,6 +251,7 @@ convolutional
convs
convtasnet
convtranspose
couldnt
covl
covls
cpulm
@@ -307,6 +312,7 @@ Diarizing
dictify
diDataset
didatasets
didnt
disambig
discretized
discretizes
@@ -356,8 +362,11 @@ finv
fitzooth
flac
fltp
fo
foos
fpr
freqs
fro
fromx
fsa's
fsas
@@ -517,6 +526,7 @@ nbest
nbin
nccl
ncor
nd
ndarray
ndim
ndims
@@ -555,14 +565,17 @@ numspks
nvvm
nwerr
NYU's
oclock
oemax
oen
Omniglot
onnxruntime
onwsj
openfst
openrir
optim
osama
ot
ovrl
paedophiles
parametrizations
@@ -678,6 +691,7 @@ sents
septillionths
seqlm
seqs
ser
sers
sess
setu
@@ -699,6 +713,7 @@ snrlevels
snrs
snts
soxi
specif
spectr
sphs
spkid
@@ -707,6 +722,7 @@ spkrdata
spkrec
spkrs
spks
splitted
srate
srmr
srmrpy
@@ -741,6 +757,8 @@ tdoa
tdoas
tdur
texthvc
thats
theyre
tids
timit
tjoint
@@ -825,6 +843,7 @@ xlsr
xmls
xponent
yamls
youre
ӿéæœâçèàûî

####### Names #######
@@ -1159,6 +1178,8 @@ utilises
visualisation

####### Non-English #######
AUJOURD
AUJOURD'HUI
collés
delle
encadre
@@ -1167,5 +1188,4 @@ Politecnica
quelques
Université
Università
AUJOURD'HUI
AUJOURD
vie
9 changes: 6 additions & 3 deletions .pre-commit-config.yaml
@@ -30,7 +30,10 @@ repos:
hooks:
- id: yamllint

- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v8.6.0
- repo: https://github.com/codespell-project/codespell
rev: v2.2.4
hooks:
- id: cspell
- id: codespell
args: [--ignore-words=.dict-speechbrain.txt]
additional_dependencies:
- tomli
53 changes: 0 additions & 53 deletions cspell.json

This file was deleted.

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -93,7 +93,7 @@


def run_apidoc(app):
"""Generage API documentation"""
"""Generate API documentation"""
import better_apidoc

better_apidoc.APP = app
2 changes: 1 addition & 1 deletion docs/contributing.md
@@ -113,7 +113,7 @@ and where the implemented algorithm needs clarification.
- Automatically run cspell
- NOTE: If the hooks fix something (e.g. trailing whitespace or reformat with black), these changes are not automatically added and committed. You’ll have to add the fixed files again and run the commit again. I guess this is a safeguard: don’t blindly accept changes from git hooks.
- NOTE2: The hooks are only run on the files you git added to the commit. This is in contrast to the CI pipeline, which always tests everything.
- NOTE3: If a word is not present in the dictionary, you can either add the word to `.dict-speechbrain.txt` or you can ignore the word by adding a comment `# cspell: ignore <word>`
- NOTE3: If a word is flagged as a spelling error but it should be kept, you can add the word to `.dict-speechbrain.txt`

### the git pre-push hooks
- Black and flake8 as checks on the whole repo
2 changes: 1 addition & 1 deletion docs/guidance.md
@@ -207,4 +207,4 @@ The other files in this folder provide further guidance on where is what configu
Keep in mind, the SpeechBrain community is in-flux, so is a constellation of maintainers and reviewers nothing more but a snapshot.

_Note: github workflows take the definition of a PR, what is specified within its branch. We might update our procedures on the `develop` branch (e.g., to meet dependency updates).
Consequentially, PR and `unstable` branches need to fetch from latest `develop` when testing related definitions are updated._
Consequently, PR and `unstable` branches need to fetch from latest `develop` when testing related definitions are updated._
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -13,3 +13,6 @@ exclude = '''
)/
)
'''

[tool.codespell]
skip = "./tests/tmp,./**/result,*.csv,*train.txt,*test.txt"
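Together, the `skip` globs above and the `--ignore-words=.dict-speechbrain.txt` flag in the pre-commit hook control what codespell reports: skipped paths are never scanned, and flagged words found in the project dictionary are suppressed. A minimal Python sketch of the ignore-words idea (a hypothetical helper, not codespell's actual implementation; it assumes case-insensitive matching and illustrative word lists):

```python
# Hypothetical sketch of how an --ignore-words file suppresses findings.
# "flagged" stands in for words a spell checker would report; codespell's
# real logic (dictionaries, context, casing rules) is more involved.

def apply_ignore_words(flagged, ignore_lines):
    """Drop flagged words that appear in the ignore-words file (case-insensitive)."""
    ignore = {line.strip().lower() for line in ignore_lines if line.strip()}
    return [word for word in flagged if word.lower() not in ignore]

# Entries like those added to .dict-speechbrain.txt in this commit
ignore_lines = ["splitted", "couldnt", "whats", ""]
flagged = ["splitted", "teh", "Couldnt"]

print(apply_ignore_words(flagged, ignore_lines))  # ['teh']
```

This is why the dictionary diff above mostly adds intentional "misspellings" (`splitted`, `couldnt`, `whats`): they are valid identifiers or transcript tokens in the repo that codespell would otherwise flag.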
2 changes: 1 addition & 1 deletion recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml
@@ -19,7 +19,7 @@ train_log: !ref <output_folder>/train_log.txt
# Data files
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
data_folder: !PLACEHOLDER # e,g./path/to/aishell
data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augmentation will automatically be downloaded here.
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
skip_prep: False
remove_compressed_wavs: False
ckpt_interval_minutes: 15 # save checkpoint every N min
@@ -20,7 +20,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z

# Data files
data_folder: !PLACEHOLDER # e,g./path/to/aishell
data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augmentation will automatically be downloaded here.
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
skip_prep: False
remove_compressed_wavs: False
ckpt_interval_minutes: 15 # save checkpoint every N min
2 changes: 1 addition & 1 deletion recipes/AISHELL-1/Tokenizer/train.py
@@ -1,6 +1,6 @@
#!/usr/bin/env/python3
"""Recipe for training a BPE tokenizer with AISHELL-1.
The tokenizer coverts transcripts into sub-word units that can
The tokenizer converts transcripts into sub-word units that can
be used to train a language (LM) or an acoustic model (AM).
To run this recipe, do the following:
10 changes: 5 additions & 5 deletions recipes/AMI/Diarization/experiment.py
@@ -267,7 +267,7 @@ def diarize_dataset(full_meta, split_type, n_lambdas, pval, n_neighbors=10):
num_spkrs = diar.get_oracle_num_spkrs(rec_id, spkr_info)
else:
if params["affinity"] == "nn":
# Num of speakers tunned on dev set (only for nn affinity).
# Num of speakers tuned on dev set (only for nn affinity).
num_spkrs = n_lambdas
else:
# Num of speakers will be estimated using max eigen gap for cos based affinity.
@@ -348,7 +348,7 @@ def dev_pval_tuner(full_meta, split_type):
# p_val is needed in oracle_n_spkr=False when using kmeans backend.
break

# Take p_val that gave minmum DER on Dev dataset.
# Take p_val that gave minimum DER on Dev dataset.
tuned_p_val = prange[DER_list.index(min(DER_list))]

return tuned_p_val
@@ -383,7 +383,7 @@ def dev_ahc_threshold_tuner(full_meta, split_type):
if params["oracle_n_spkrs"] is True:
break # no need of threshold search.

# Take p_val that gave minmum DER on Dev dataset.
# Take p_val that gave minimum DER on Dev dataset.
tuned_p_val = prange[DER_list.index(min(DER_list))]

return tuned_p_val
@@ -429,7 +429,7 @@ def dev_nn_tuner(full_meta, split_type):

def dev_tuner(full_meta, split_type):
"""Tuning n_components on dev set. Used for nn based affinity matrix.
Note: This is a very basic tunning for nn based affinity.
Note: This is a very basic tuning for nn based affinity.
This is work in progress till we find a better way.
"""

@@ -453,7 +453,7 @@

DER_list.append(DER_)

# Take n_lambdas with minmum DER.
# Take n_lambdas with minimum DER.
tuned_n_lambdas = DER_list.index(min(DER_list)) + 1

return tuned_n_lambdas
6 changes: 3 additions & 3 deletions recipes/Aishell1Mix/separation/train.py
@@ -307,7 +307,7 @@ def add_speed_perturb(self, targets, targ_lens):
return mix, targets

def cut_signals(self, mixture, targets):
"""This function selects a random segment of a given length withing the mixture.
"""This function selects a random segment of a given length within the mixture.
The corresponding targets are selected accordingly"""
randstart = torch.randint(
0,
@@ -430,7 +430,7 @@ def save_results(self, test_data):
def save_audio(self, snt_id, mixture, targets, predictions):
"saves the test audio (mixture, targets, and estimated sources) on disk"

# Create outout folder
# Create output folder
save_path = os.path.join(self.hparams.save_folder, "audio_results")
if not os.path.exists(save_path):
os.mkdir(save_path)
@@ -639,7 +639,7 @@ def audio_pipeline_noise(noise_wav):
os.path.normpath(hparams["base_folder_dm"]) + "_processed"
)

# Colleting the hparams for dynamic batching
# Collecting the hparams for dynamic batching
dm_hparams = {
"train_data": hparams["train_data"],
"data_folder": hparams["data_folder_nspks"],
2 changes: 1 addition & 1 deletion recipes/AudioMNIST/audiomnist_prepare.py
@@ -459,7 +459,7 @@ def convert_speaker_meta_values(speaker_meta):


def convert_speaker_meta(speaker_meta):
"""Converts speaker metdata to the target format
"""Converts speaker metadata to the target format
Arguments
---------
2 changes: 1 addition & 1 deletion recipes/AudioMNIST/diffusion/README.md
@@ -1,5 +1,5 @@
# Denoising Diffusion Probabilistic Model
This folder contrains scripts for running a Denoising Diffusion Probabilistic Model
This folder contains scripts for running a Denoising Diffusion Probabilistic Model
generative model with the [AudioMNIST](https://huggingface.co/datasets/flexthink/audiomnist) dataset, which contains recordings
of spoken English digits in a variety of voices and accents.

4 changes: 2 additions & 2 deletions recipes/AudioMNIST/diffusion/train.py
@@ -911,7 +911,7 @@ def save_spectrograms(self, samples, path, folder="spec", labels=None):
samples: torch.Tensor
a tensor of sample spectrograms
path: str
ths path to samples for a given epoch
the path to samples for a given epoch
folder: str
the name of the folder where the spectrograms
will be saved
@@ -1016,7 +1016,7 @@ def save_audio(self, wav, path, folder="wav", labels=None):
the destination directory
folder: str
the subfolder within the destinatin directory
the subfolder within the destination directory
labels: list
a list of labels, for each sample. If omitted,
2 changes: 1 addition & 1 deletion recipes/BinauralWSJ0Mix/prepare_data.py
@@ -1,5 +1,5 @@
"""
The .csv preperation functions for Binaural-WSJ0Mix.
The .csv preparation functions for Binaural-WSJ0Mix.
Author
* Cem Subakan 2020
2 changes: 1 addition & 1 deletion recipes/BinauralWSJ0Mix/separation/train.py
@@ -562,7 +562,7 @@ def save_results(self, test_data):
def save_audio(self, snt_id, mixture, targets, predictions):
"saves the test audio (mixture, targets, and estimated sources) on disk"

# Create outout folder
# Create output folder
save_path = os.path.join(self.hparams.save_folder, "audio_results")
if not os.path.exists(save_path):
os.mkdir(save_path)
2 changes: 1 addition & 1 deletion recipes/CommonLanguage/README.md
@@ -58,7 +58,7 @@ This dataset is composed of speakers of 45 languages that were carefully selecte
* Tamil
* Tatar
* Turkish
* Ukranian
* Ukrainian
* Welsh

## Other information