Просмотр исходного кода

Fix speaker for folder dataset

Lengyue 1 год назад
Родитель
Commit
7b4e1fd10a
1 изменённый файл: 4 добавлено и 3 удалено
  1. 4 3
      tools/llama/build_dataset.py

+ 4 - 3
tools/llama/build_dataset.py

@@ -27,6 +27,7 @@ def task_generator_folder(root: Path, text_extension: str):
     grouped_files = defaultdict(list)
     for file in tqdm(files, desc=f"Grouping {root}"):
         p = str(file.parent)
+        speaker = file.parent.name
 
         try:
             if isinstance(text_extension, str):
@@ -37,13 +38,13 @@ def task_generator_folder(root: Path, text_extension: str):
             logger.error(f"Failed to read text {file}: {e}")
             continue
 
-        grouped_files[p].append((file, texts))
+        grouped_files[p].append((speaker, file, texts))
 
     logger.info(
         f"Found {len(grouped_files)} groups in {root}, {list(grouped_files.keys())[:5]}..."
     )
-    for name, subset in grouped_files.items():
-        yield name, subset, "folder"
+    for speaker, file, texts in grouped_files.values():
+        yield speaker, (file, texts), "folder"
 
 
 def task_generator_filelist(filelist):