Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generation/generate_accent_demo_local.py
+++ b/scripts/generation/generate_accent_demo_local.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Accent demo generator using LOCAL VoxCPM model
+Using the same successful parameters as the Ben voice cloning
+"""
+
+import os
+import sys
+import soundfile as sf
+import numpy as np
+
+# Filesystem layout: everything lives under the workspace root.
+WORKSPACE = "/root/tts"
+VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
+OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")
+
+# Make VoxCPM's src/ importable, then import it (exit status 1 on failure).
+sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
+print("✅ Added VoxCPM path")
+try:
+    from voxcpm.core import VoxCPM
+    print("✅ VoxCPM imported successfully")
+except Exception as exc:
+    print(f"❌ Failed to import VoxCPM: {exc}")
+    sys.exit(1)
+
+# Use the first local model directory that exists; abort if neither does.
+for LOCAL_MODEL_PATH in (
+    os.path.join(VOXCPM_DIR, "models", name) for name in ("openbmb__VoxCPM1.5", "VoxCPM1.5")
+):
+    if os.path.exists(LOCAL_MODEL_PATH):
+        break
+else:
+    print("❌ Local model path not found")
+    sys.exit(1)
+print(f"✅ Using local model: {LOCAL_MODEL_PATH}")
+
+# Make sure the demo output directory is there before anything is written.
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+print(f"✅ Output directory: {OUTPUT_DIR}")
+
+# Load the local checkpoint with the denoiser and optimization switched off;
+# any failure here aborts the script with exit status 1.
+print("\n🚀 Initializing VoxCPM with successful parameters...")
+try:
+    model = VoxCPM(
+        voxcpm_model_path=LOCAL_MODEL_PATH,
+        enable_denoiser=False,
+        optimize=False,
+    )
+    print("✅ VoxCPM initialized successfully")
+except Exception as exc:
+    print(f"❌ VoxCPM initialization failed: {exc}")
+    sys.exit(1)
+
+# Prompt recordings for each host, stored under <WORKSPACE>/hosts.
+REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
+REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")
+print(f"✅ Ben reference audio: {REAL_BEN_REF}")
+print(f"✅ Judy reference audio: {REAL_JUDY_REF}")
+
+# Transcripts that are meant to match the reference recordings, keyed by host.
+REFERENCE_TEXTS = {
+    "ben": "白日依山尽，黄河入海流。欲穷千里目，更上一层楼。",
+    "judy": "题西林壁，横看成岭侧成峰，远近高低各不同。不识庐山真面目，只缘身在此山中。",
+}
+
+def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
+    """Synthesize ``text`` with Ben's reference voice and save it as a WAV demo.
+
+    ``accent_name`` is used for the log line and the ``<accent_name>_demo.wav``
+    file name. ``output_dir`` is created if missing. Returns the written file's
+    path, or None on failure (errors are printed with a traceback, not raised).
+    """
+    # NOTE(review): accent_name never reaches model.generate(), so every accent
+    # uses the same prompt audio/text and differs only by file name -- confirm.
+    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
+    print(f"\n🎙️ Generating {accent_name} accent demo...")
+    print(f"Text: {text[:50]}...")
+
+    try:
+        # A caller-supplied directory is not covered by the module-level makedirs.
+        os.makedirs(output_dir or ".", exist_ok=True)
+        audio = model.generate(
+            text=text,
+            prompt_wav_path=REAL_BEN_REF,
+            prompt_text=REFERENCE_TEXTS["ben"],
+            cfg_value=2.0,
+            inference_timesteps=20,
+            normalize=True,
+            denoise=False,
+            retry_badcase=True,
+        )
+        sample_rate = model.tts_model.sample_rate
+        sf.write(output_file, audio, sample_rate)
+        if not os.path.exists(output_file):
+            print("❌ Failed to save")
+            return None
+        print("✅ Generated successfully!")
+        print(f"   File: {output_file}")
+        print(f"   Size: {os.path.getsize(output_file)} bytes")
+        print(f"   Duration: {len(audio) / sample_rate:.2f} seconds")
+        return output_file
+    except Exception as e:
+        # Best-effort batch job: report the failure and let the caller continue.
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
+    """Synthesize ``pinyin`` with Ben's reference voice into cantonese_pinyin_demo.wav.
+
+    ``text`` is only printed as a preview; ``pinyin`` is what ``model.generate``
+    speaks. ``output_dir`` is created if missing. Returns the written file's
+    path, or None on failure (errors are printed with a traceback, not raised).
+    """
+    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
+    print("\n🎙️ Generating Cantonese pinyin demo...")
+    print(f"Text: {text[:50]}...")
+    try:
+        os.makedirs(output_dir or ".", exist_ok=True)
+        audio = model.generate(
+            text=pinyin,
+            prompt_wav_path=REAL_BEN_REF,
+            prompt_text=REFERENCE_TEXTS["ben"],
+            cfg_value=2.0,
+            inference_timesteps=20,
+            normalize=True,
+            denoise=False,
+            retry_badcase=True,
+        )
+        sample_rate = model.tts_model.sample_rate
+        sf.write(output_file, audio, sample_rate)
+        if not os.path.exists(output_file):
+            print("❌ Failed to save")
+            return None
+        print("✅ Generated successfully!")
+        print(f"   File: {output_file}")
+        print(f"   Size: {os.path.getsize(output_file)} bytes")
+        print(f"   Duration: {len(audio) / sample_rate:.2f} seconds")
+        return output_file
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+if __name__ == "__main__":
+    # One English sentence, rendered once per accent label.
+    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"
+    for accent in ("indian", "russian", "singaporean", "hongkong"):
+        generate_accent_demo_with_real_reference(test_text, accent)
+
+    # NOTE(review): the "pinyin" argument below is an English translation, not a
+    # romanization of the Chinese sentence -- confirm this is intended.
+    generate_cantonese_pinyin_demo(
+        "张学友是香港著名歌手，被誉为歌神。",
+        "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs.",
+    )
+
+    banner = "=" * 70
+    print(f"\n{banner}")
+    print("ACCENT DEMOS GENERATION COMPLETE")
+    print(banner)
+    print(f"Output directory: {OUTPUT_DIR}")
+    print("\nAll demos generated with the SAME parameters that worked for Ben's voice!")