Explorar o código

Remove unused scripts

Lengyue hai 2 anos
pai
achega
cb7a17dd0c

+ 0 - 119
tools/wenet_clean/clean_wenet_speech.py

@@ -1,119 +0,0 @@
-import json
-import os
-import subprocess
-import tempfile
-import time
-from pathlib import Path
-
-import librosa
-import soundfile as sf
-import torch
-import torchaudio
-from fish_audio_preprocess.utils.separate_audio import (
-    init_model,
-    merge_tracks,
-    separate_audio,
-)
-from tqdm import tqdm
-
# Rank/world-size come from SLURM env vars so the same script can be run as a
# task array; the defaults (0, 1) make a bare single-process run work too.
rank = int(os.environ.get("SLURM_PROCID", 0))
world_size = int(os.environ.get("SLURM_NTASKS", 1))
# Always "cuda:0": each task presumably sees a single GPU via
# CUDA_VISIBLE_DEVICES set by the launcher — TODO confirm.
device = torch.device("cuda:0")
print(f"Rank {rank}/{world_size} on {device}")
-
-
def main():
    """Denoise WenetSpeech recordings and export high-confidence segments.

    Pipeline per recording:
      1. Decode the source file to 24 kHz mono 16-bit PCM with ffmpeg.
      2. Separate vocals from background with the htdemucs model.
      3. Write every transcript segment with confidence > 0.95 as
         cleaned/<aid>/S<idx>.wav plus a sibling .txt transcript.

    An empty "done" marker per recording makes the run resumable, and the
    module-level rank/world_size shard the dataset across SLURM tasks.
    """
    meta_path = Path("dataset/tts/WenetSpeech/WenetSpeech.json")
    dataset_path = Path("dataset/tts/WenetSpeech")
    cleaned_path = Path("dataset/tts/WenetSpeech/cleaned")
    if not cleaned_path.exists():
        cleaned_path.mkdir(parents=True)

    # Pretrained Demucs ("htdemucs") source-separation model on this task's GPU.
    demucs = init_model("htdemucs", device)
    print("Model loaded")

    with open(meta_path) as f:
        dataset = json.load(f)["audios"]

    print(f"Dataset loaded, {len(dataset)} samples")
    # Round-robin shard so every SLURM task gets a disjoint subset.
    dataset = dataset[rank::world_size]
    print(f"Dataset split, {len(dataset)} samples")

    for data_idx, data in enumerate(dataset):
        # Marker written only after a recording is fully processed;
        # lets a restarted job skip completed work.
        done_path = cleaned_path / data["aid"] / "done"
        done_path.parent.mkdir(parents=True, exist_ok=True)

        if done_path.exists():
            continue

        print(f"Processing {data_idx}/{len(dataset)} at rank {rank}")

        try:
            with tempfile.NamedTemporaryFile(suffix=".wav") as f:
                # ffmpeg decodes whatever container/codec the source uses and
                # resamples to 24 kHz PCM for the rest of the chain.
                subprocess.check_call(
                    [
                        "ffmpeg",
                        "-y",
                        "-i",
                        str(dataset_path / data["path"]),
                        "-c:a",
                        "pcm_s16le",
                        "-threads",
                        "0",
                        "-ar",
                        "24000",
                        str(f.name),
                    ],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
                # sr=None keeps ffmpeg's 24 kHz; must load before the temp
                # file is deleted on context exit.
                raw_audio, sr = librosa.load(f.name, sr=None, mono=True)

            raw_audio = torch.from_numpy(raw_audio[None]).to(device)
            # NOTE(review): the audio is resampled twice (source -> 24 kHz in
            # ffmpeg, then 24 kHz -> demucs.samplerate here); loading at the
            # native rate would save one pass — confirm before changing.
            audio = torchaudio.functional.resample(
                raw_audio, orig_freq=sr, new_freq=demucs.samplerate
            )
            # Make it 2 channels (the separation model is fed a stereo tensor)
            audio = torch.cat([audio, audio], dim=0)
            tracks = separate_audio(
                demucs, audio, shifts=1, num_workers=0, progress=False
            )
            # Keep only the vocals stem; [0] takes the first channel.
            audio = merge_tracks(tracks, filter=["vocals"])[0]
            vocals, sr = (
                torchaudio.functional.resample(
                    audio, orig_freq=demucs.samplerate, new_freq=24000
                ),
                24000,
            )
            vocals = vocals.cpu().numpy()

            for idx, segment in enumerate(data["segments"]):
                # Drop segments the aligner is not confident about.
                if segment["confidence"] <= 0.95:
                    continue

                # Load audio — assumes begin_time/end_time are in seconds
                # (TODO confirm against the WenetSpeech metadata spec).
                begin = int(segment["begin_time"] * sr)
                end = int(segment["end_time"] * sr)
                segment_audio = vocals[begin:end]

                # Write audio
                temp_path = cleaned_path / data["aid"] / f"S{idx:05d}.wav"
                temp_path.parent.mkdir(parents=True, exist_ok=True)
                sf.write(temp_path, segment_audio, samplerate=sr)

                # Write text
                temp_path = temp_path.with_suffix(".txt")
                temp_path.write_text(segment["text"])

            # Write done file
            done_path.write_text("")
        except Exception as e:
            # Best-effort batch job: log, back off briefly (transient
            # disk/GPU/ffmpeg hiccups), and continue with the next recording.
            print(f"Error {e} on {data_idx}/{len(dataset)} at rank {rank}")
            time.sleep(10)
            continue

    print("Done")


if __name__ == "__main__":
    main()

+ 0 - 88
tools/wenet_clean/compress_tar.py

@@ -1,88 +0,0 @@
-import io
-import random
-import tarfile
-from multiprocessing import Process
-from pathlib import Path
-
-from tqdm import tqdm
-
-
def chunked_tarring(rank, file_list, base_folder, output_folder, chunk_size=1024**3):
    """Pack audio/transcript file pairs into tar archives of ~chunk_size bytes.

    Args:
        rank: Worker index, used only in archive filenames and log lines.
        file_list: Audio files to pack; a file without a sibling ``.txt``
            transcript is silently skipped.
        base_folder: Root that archive member names are made relative to.
        output_folder: Directory receiving ``chunk-{rank:03d}-{NNNN:04d}.tar``.
        chunk_size: Soft per-archive byte limit (default 1 GiB); a pair that
            would overflow the current archive starts a new one, so a single
            oversized pair still gets its own chunk.
    """
    chunk_count = 1
    total_size = 0
    saved_count = 0
    members_in_chunk = 0

    buffer = io.BytesIO()
    tar = tarfile.open(fileobj=buffer, mode="w")

    def _flush():
        # Close the in-memory tar, write it to disk as the next chunk,
        # and start a fresh buffer/tar.
        nonlocal buffer, tar, chunk_count, total_size, members_in_chunk
        tar.close()
        buffer.seek(0)
        with open(
            output_folder / f"chunk-{rank:03d}-{chunk_count:04d}.tar", "wb"
        ) as f:
            f.write(buffer.read())
        buffer.close()
        buffer = io.BytesIO()
        tar = tarfile.open(fileobj=buffer, mode="w")
        chunk_count += 1
        total_size = 0
        members_in_chunk = 0

    for audio_file in file_list:
        txt_file = audio_file.with_suffix(".txt")
        if not txt_file.exists():
            # Unpaired audio (no transcript) is useless downstream; skip it.
            continue

        file_size = audio_file.stat().st_size + txt_file.stat().st_size
        # Fix: only roll over when the current chunk has members; the
        # original flushed unconditionally, emitting an empty archive when
        # the very first pair already exceeded chunk_size.
        if members_in_chunk and total_size + file_size > chunk_size:
            _flush()

        tar.add(audio_file, arcname=audio_file.relative_to(base_folder))
        tar.add(txt_file, arcname=txt_file.relative_to(base_folder))

        total_size += file_size
        members_in_chunk += 1

        if saved_count % 1000 == 0:
            print(f"Rank {rank}: {saved_count}/{len(file_list)}")

        saved_count += 1

    # Fix: the original always wrote the trailing archive, leaving one
    # empty .tar per worker whenever the tail (or the whole list) had no
    # packable pairs. Only write it when it actually has members.
    if members_in_chunk:
        _flush()
    else:
        tar.close()
        buffer.close()

    print(f"Rank {rank}: {saved_count}/{len(file_list)}")
-
-
if __name__ == "__main__":
    # Fan the cleaned segment tree out to `num_workers` processes, each
    # packing its own shuffled slice of files into tar chunks.
    base_folder = Path("/mnt/nvme1/multi-modal-test/WenetSpeech/cleaned")
    output_folder = Path("/mnt/nvme1/multi-modal-test/WenetSpeech/compressed")
    output_folder.mkdir(exist_ok=True, parents=True)
    num_workers = 50

    file_list = list(tqdm(base_folder.rglob("*.flac")))
    # Shuffle so chunk sizes stay roughly balanced across workers.
    random.shuffle(file_list)
    print(f"Total files: {len(file_list)}")

    per_worker = len(file_list) // num_workers
    workers = []

    for worker_id in range(num_workers):
        lo = worker_id * per_worker
        if worker_id == num_workers - 1:
            # Last worker also takes the division remainder.
            hi = len(file_list)
        else:
            hi = lo + per_worker

        proc = Process(
            target=chunked_tarring,
            args=(worker_id, file_list[lo:hi], base_folder, output_folder),
        )
        proc.start()
        workers.append(proc)

    for proc in workers:
        proc.join()

    print("Done")

+ 0 - 27
tools/wenet_clean/launch.py

@@ -1,27 +0,0 @@
-import os
-import subprocess as sp
-import sys
-
# Emulate a SLURM array launch on a single machine: spawn SLURM_NTASKS
# copies of the cleaning script, each with its own rank and a dedicated
# GPU (round-robin over 8 devices).
SLURM_NTASKS = 6

processes = []
for i in range(SLURM_NTASKS):
    env = os.environ.copy()
    env["SLURM_PROCID"] = str(i)
    env["SLURM_NTASKS"] = str(SLURM_NTASKS)
    env["CUDA_VISIBLE_DEVICES"] = str(i % 8)

    processes.append(
        sp.Popen(
            # Fix: pass an argument list with shell=False (no shell string
            # to quote; the original also had a pointless f-prefix on a
            # placeholder-free string), and use sys.executable so children
            # run under the same interpreter.
            # NOTE(review): path says preparing_data/... while this commit
            # stores the script under tools/wenet_clean/ — confirm layout.
            [sys.executable, "preparing_data/wenet_clean/clean_wenet_speech.py"],
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
    )


for p in processes:
    # Fix: the original printed p.communicate(), which is always
    # (None, None) when stdout/stderr are not pipes — report the exit
    # code instead.
    p.wait()
    print(f"Task {p.args} exited with code {p.returncode}")