# tts/scripts/generation/generate_accent_demo_local.py

#!/usr/bin/env python3
"""
Accent demo generator using LOCAL VoxCPM model
Using the same successful parameters as the Ben voice cloning
"""

import os
import sys
import soundfile as sf
import numpy as np

# Filesystem layout: every directory used by this script hangs off WORKSPACE.
WORKSPACE = "/root/tts"
VOXCPM_DIR, OUTPUT_DIR = [
    os.path.join(WORKSPACE, name) for name in ("VoxCPM", "accent_demos_local")
]

# Put the VoxCPM source tree at the front of the import search path so that
# `voxcpm` resolves to this checkout.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# Load the VoxCPM class. Nothing below can run without it, so any failure
# while importing is reported and the script exits with status 1.
try:
    from voxcpm.core import VoxCPM
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM imported successfully")

# Locate the local model directory (same model as the earlier Ben voice
# cloning run). Two directory names are tried in order; the first one that
# exists wins, and the script exits with status 1 if neither is present.
for _model_dir_name in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dir_name)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    print("❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Create the default output directory up front (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Build the module-wide VoxCPM instance used by every generator below.
# The denoiser and the optimize flag are both switched off; per the original
# author's notes these are the settings that worked for the Ben voice cloning.
print("\n🚀 Initializing VoxCPM with successful parameters...")
try:
    model = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH,
        enable_denoiser=False,
        optimize=False,
    )
except Exception as init_error:
    # Without a model there is nothing to generate: report and stop.
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM initialized successfully")

# Reference (prompt) audio recordings used for voice cloning.
# NOTE(review): only REAL_BEN_REF is used by the generators visible in this
# file; REAL_JUDY_REF is defined but not referenced by them.
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")

# Only print a ✅ when the file is really on disk. A missing prompt file would
# otherwise surface much later as an opaque error inside model.generate().
for _ref_label, _ref_path in (("Ben", REAL_BEN_REF), ("Judy", REAL_JUDY_REF)):
    if os.path.isfile(_ref_path):
        print(f"✅ {_ref_label} reference audio: {_ref_path}")
    else:
        print(f"⚠️ {_ref_label} reference audio not found: {_ref_path}")

# Transcripts passed as `prompt_text` alongside the reference audio; they are
# meant to match what is spoken in the corresponding recording.
REFERENCE_TEXTS = {
    "ben": "白日依山尽，黄河入海流。欲穷千里目，更上一层楼。",
    "judy": "题西林壁，横看成岭侧成峰，远近高低各不同。不识庐山真面目，只缘身在此山中。",
}

def _synthesize_and_save(synth_text, output_file):
    """Synthesize ``synth_text`` with Ben's reference voice and save it as WAV.

    Shared by both demo generators so that they always use the same prompt
    audio, prompt transcript and sampling parameters.

    Args:
        synth_text: Text passed to ``model.generate`` as the utterance.
        output_file: Destination path of the audio file.

    Returns:
        ``output_file`` on success, or ``None`` when generation or saving
        failed. Failures are printed (message plus traceback), not raised.
    """
    try:
        # A caller-supplied output directory may not exist yet; only the
        # default OUTPUT_DIR is created at module level. Doing this before
        # generation also avoids wasting a synthesis run on an unwritable path.
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

        # Same parameters as the earlier Ben voice-cloning run.
        audio = model.generate(
            text=synth_text,
            prompt_wav_path=REAL_BEN_REF,
            prompt_text=REFERENCE_TEXTS["ben"],
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,   # text normalization on
            denoise=False,    # no denoising of the prompt
            retry_badcase=True,
        )

        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        if not os.path.exists(output_file):
            print("❌ Failed to save")
            return None

        file_size = os.path.getsize(output_file)
        # assumes `audio` is indexed by sample along its first axis — TODO confirm
        duration = len(audio) / sample_rate
        print("✅ Generated successfully!")
        print(f"   File: {output_file}")
        print(f"   Size: {file_size} bytes")
        print(f"   Duration: {duration:.2f} seconds")
        return output_file

    except Exception as e:
        # Best-effort: one failed demo must not abort the remaining ones.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None


def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
    """Generate ``<accent_name>_demo.wav`` using Ben's real reference audio.

    NOTE(review): ``accent_name`` only determines the output filename and the
    log line; the inputs given to the model do not vary with it.

    Args:
        text: Text to synthesize.
        accent_name: Label used in the output filename and in log output.
        output_dir: Directory for the WAV file; created if it is missing.

    Returns:
        Path of the written file, or ``None`` if generation failed.
    """
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"\n🎙️ Generating {accent_name} accent demo...")
    print(f"Text: {text[:50]}...")
    return _synthesize_and_save(text, output_file)


def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
    """Generate ``cantonese_pinyin_demo.wav`` from the ``pinyin`` string.

    Only ``pinyin`` is sent to the model; ``text`` is shown in the log for
    reference and is not synthesized.

    Args:
        text: Original sentence, used for logging only.
        pinyin: String actually passed to the model for synthesis.
        output_dir: Directory for the WAV file; created if it is missing.

    Returns:
        Path of the written file, or ``None`` if generation failed.
    """
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("\n🎙️ Generating Cantonese pinyin demo...")
    print(f"Text: {text[:50]}...")
    # Also log what is really being synthesized, since it differs from `text`.
    print(f"Pinyin: {pinyin[:50]}...")
    return _synthesize_and_save(pinyin, output_file)

if __name__ == "__main__":
    # A single English sentence is reused for every accent demo.
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # NOTE(review): the accent label only ends up in the output filename and
    # the log line, so these four files are generated from identical inputs.
    for accent in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo_with_real_reference(test_text, accent)

    # Cantonese demo.
    # NOTE(review): `cantonese_pinyin` holds an English sentence rather than a
    # romanization of `cantonese_text` — confirm this is intended.
    cantonese_text = "张学友是香港著名歌手，被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(cantonese_text, cantonese_pinyin)

    # Closing summary banner.
    banner = "=" * 70
    print(f"\n{banner}")
    print("ACCENT DEMOS GENERATION COMPLETE")
    print(banner)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nAll demos generated with the SAME parameters that worked for Ben's voice!")