Convert directory fbcode/deeplearning to use the Ruff Formatter #1024

Open · wants to merge 1 commit into main
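Nearly every hunk in this PR makes the same change: Ruff, which follows Black's code style, deletes the blank line that sits directly under a function signature. A minimal sketch of that rule, using a function from this diff (the exact Ruff invocation and configuration used for this PR are not shown, so the command in the comment is an assumption):

from pathlib import Path
from typing import List

# Before formatting: a blank line separates the signature from the body.
def load_filter(path_filter: Path) -> List[str]:

    with open(path_filter, "r") as f:
        return [x.strip() for x in f.readlines()]

# After running the formatter (e.g. `ruff format .`, an assumed invocation),
# the blank line under the signature is gone; behavior is unchanged.
def load_filter(path_filter: Path) -> List[str]:
    with open(path_filter, "r") as f:
        return [x.strip() for x in f.readlines()]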
@@ -46,7 +46,6 @@ def to_wav2letterFormat(data: torch.tensor, sr: int) -> torch.tensor:
 
 
 def get_base_data_from_csv(pathTSV) -> List[Dict[str, str]]:
-
     out = []
     with open(pathTSV, "r", encoding="utf-8") as tsvfile:
         reader = csv.DictReader(tsvfile, dialect="excel-tab")
@@ -64,7 +63,6 @@ def norm_text(
     replace_set: Optional[Dict[str, str]] = None,
     del_set: Optional[Set[str]] = None,
 ) -> Tuple[bool, str]:
-
     text = text.lower()
     if replace_set is not None:
         for char_, val in replace_set.items():
@@ -98,7 +96,6 @@ def get_full_audio_data(
     del_set: Optional[Set[str]] = None,
     file_extension: str = None,
 ) -> List[FileInfo]:
-
     output = []
     for audio_data in tqdm(base_data, total=len(base_data)):
         path_audio = path_dir_audio / audio_data["local_path"]
@@ -130,7 +127,6 @@ def get_full_audio_data(
 def convert_audio_data(
     input_list: List[FileInfo], out_dir_audio: Path
 ) -> List[FileInfo]:
-
     out_dir_audio.mkdir(exist_ok=True)
     output = []
     for file_info in tqdm(input_list, total=len(input_list)):
@@ -153,13 +149,11 @@ def convert_audio_data(
 
 
 def load_filter(path_filter: Path) -> List[str]:
-
     with open(path_filter, "r") as f:
         return [x.strip() for x in f.readlines()]
 
 
 def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]):
-
     input_lst.sort(key=lambda x: x.id_)
     to_filter.sort()
 
@@ -183,7 +177,6 @@ def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]):
 
 
 def main(args):
-
     letters = load_letters(Path(args.path_tokens))
     data = get_base_data_from_csv(Path(args.path_tsv))
     audio_data = get_full_audio_data(
@@ -207,7 +200,6 @@
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(
         description="Build the lst input files for common voices datasets"
     )
recipes/joint_training_vox_populi/prepare_data/get_tokens.py (0 additions, 4 deletions)
@@ -18,7 +18,6 @@ def get_tokens_from_str(str_in) -> Set[str]:
 
 
 def get_tokens_from_str_list(list_str: List[str]) -> Set[str]:
-
     out = set()
     for str_in in list_str:
         out = out.union(get_tokens_from_str(str_in))
@@ -27,15 +26,13 @@ def get_tokens_from_str_list(list_str: List[str]) -> Set[str]:
 
 
 def save_tokens(tokens, path_out, eow_token="|") -> None:
-
     with open(path_out, "w") as f:
         for x in tokens:
             f.write(x + "\n")
         f.write(eow_token)
 
 
 def main(args):
-
     data = get_base_data_from_csv(args.input_csv)
     all_tokens = get_tokens_from_str_list([x["text"] for x in data])
 
@@ -48,7 +45,6 @@
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser("Token builder")
     parser.add_argument("input_csv")
     parser.add_argument("output")
recipes/joint_training_vox_populi/prepare_data/lst_utils.py (0 additions, 2 deletions)
@@ -19,14 +19,12 @@ class FileInfo:
 
 
 def save_lst(lst_data: List[FileInfo], path_out: Path) -> None:
-
     with open(path_out, "w") as file:
         for data in lst_data:
             file.write(f"{data.id_} {data.path_} {data.size*3600 * 1000} {data.text}\n")
 
 
 def load_lst(path_file: Path) -> List[FileInfo]:
-
     with open(path_file, "r") as file:
         data = [x.strip() for x in file.readlines()]
 
@@ -19,7 +19,6 @@ def has_valid_tokens(word: str, tokens: Set[str]) -> bool:
 
 
 def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]:
-
     with path_token_file.open("r") as file:
         data = [x.strip() for x in file.readlines()]
 
@@ -29,7 +28,6 @@ def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]:
 def save_lexicon(
     lexicon: Set[str], path_out: Path, eow_char: str, tokens: Set[str]
 ) -> None:
-
     list_lexicon = list(lexicon)
     list_lexicon.sort()
 
@@ -98,7 +96,6 @@ def lexicon_from_lst(
     min_occ: int = 10,
     is_raw_text: bool = False,
 ) -> None:
-
     out_lexicon = set()
     tokens = read_token_file(path_tokens, eow_char)
     log.info("Token file loaded")
recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py (6 additions, 4 deletions)
@@ -102,8 +102,9 @@ def compute_ppl_upper_limit_char_convlm(
     print("Upper word perplexity for all words: {}".format(ppl_word))
     print("Upper word perplexity for unknown words: {}".format(ppl_word_unk))
     print(
-        "(Reported in the paper) "
-        "Upper word perplexity for known words: {}".format(ppl_word_no_unk)
+        "(Reported in the paper) " "Upper word perplexity for known words: {}".format(
+            ppl_word_no_unk
+        )
     )
 
 
@@ -142,8 +143,9 @@ def compute_ppl_upper_limit_word_convlm(model, input_wordlm):
     print("Word perplexity for all words: {}".format(ppl_word))
     print("Word perplexity for unknown words: {}".format(ppl_word_unk))
     print(
-        "(Reported in the paper) "
-        "Word perplexity for known words: {}".format(ppl_word_no_unk)
+        "(Reported in the paper) " "Word perplexity for known words: {}".format(
+            ppl_word_no_unk
+        )
     )
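The two hunks above show the other formatter behavior in this PR: adjacent (implicitly concatenated) string literals are joined onto the line that opens the call, and the .format() argument is broken out onto its own line to stay within the line-length limit. A small sketch of the same rewrite, with variable names borrowed from the diff (the value assigned is a stand-in; the real script computes it):

ppl_word_no_unk = 123.4  # stand-in value for illustration only

# Layout before formatting: the two string literals sit on separate lines.
print(
    "(Reported in the paper) "
    "Word perplexity for known words: {}".format(ppl_word_no_unk)
)

# Layout after formatting: the literals are joined on the opening line and
# the format argument moves to its own line. Both calls print the same text.
print(
    "(Reported in the paper) " "Word perplexity for known words: {}".format(
        ppl_word_no_unk
    )
)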


Expand Down