# tts/scripts/generate/test_voice_cloning_fixed.py

#!/usr/bin/env python3
"""
Voice cloning test using VoxCPM
Reference audio: hosts/ben_guanquelou.wav
Reference text: 登鹳雀楼
"""
import os
import re
import sys

import numpy as np
import soundfile as sf

# Filesystem layout: every location used by this script hangs off WORKSPACE.
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
REFERENCE_FILE = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "voice_cloning_test")

# Create the output directory up front (no error if it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# The reference recording is mandatory; stop with exit status 1 without it.
if os.path.exists(REFERENCE_FILE):
    print(f"✅ Reference audio: {REFERENCE_FILE}")
else:
    print(f"❌ Reference audio not found: {REFERENCE_FILE}")
    sys.exit(1)

# Put the VoxCPM checkout's "src" directory first on the import path so the
# import below resolves against it.
_voxcpm_src = os.path.join(VOXCPM_DIR, "src")
sys.path.insert(0, _voxcpm_src)
print("✅ Added VoxCPM path")

# Import VoxCPM; any failure is reported and ends the script with status 1.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)

# Model path: try each known directory name under VoxCPM/models in order and
# keep the first one that exists. If none exists, exit with status 1.
for _model_dirname in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dirname)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    # The loop finished without a break: no candidate directory was found.
    print("❌ Model path not found")
    sys.exit(1)
print(f"✅ Model path: {LOCAL_MODEL_PATH}")

# Build the VoxCPM instance from the local model directory, with the
# denoiser and the "optimize" option both switched off.
print("\n🚀 Initializing VoxCPM...")
_voxcpm_options = {
    "voxcpm_model_path": LOCAL_MODEL_PATH,
    "enable_denoiser": False,
    "optimize": False,
}
try:
    model = VoxCPM(**_voxcpm_options)
    print("✅ VoxCPM initialized successfully")
except Exception as init_error:
    # Any construction failure is fatal for this script.
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)

# Text preprocessing function (handle numbers)
# Spoken replacements for the numeric tokens this script knows how to read.
_SPOKEN_NUMBERS = {
    "2001": "two thousand and one",
    "2009": "two thousand and nine",
    "2008": "two thousand and eight",
    "70%": "seventy percent",
    "10": "ten",
}

# A token only matches when it is not embedded in a longer run of digits.
# Plain substring replacement would otherwise turn "2010" into "20ten",
# "100" into "ten0" and "170%" into "1seventy percent".
_SPOKEN_NUMBER_RE = re.compile(r"(?<!\d)(?:(?:2001|2009|2008|10)(?!\d)|70%)")


def preprocess_text(text):
    """Replace a few known numerals with their spoken English form.

    Only the standalone tokens "2001", "2009", "2008", "70%" and "10" are
    rewritten. Digits that are part of a longer number (for example the
    "10" in "2010" or "100") are left untouched.

    Args:
        text: The input string to normalise.

    Returns:
        A new string with the known numeric tokens spelled out. Text
        containing none of them is returned unchanged.
    """
    return _SPOKEN_NUMBER_RE.sub(
        lambda match: _SPOKEN_NUMBERS[match.group(0)], text
    )

# Test texts, declared as (id, text, output filename) rows and expanded into
# the list of dicts that the rest of the script iterates over.
_TEST_ROWS = (
    (
        "test1",
        "Hello, this is a voice cloning test using VoxCPM. I am speaking in English to demonstrate the voice cloning capability. The system captures my tone, rhythm, and speaking style from the reference audio.",
        "test1_intro.wav",
    ),
    (
        "test2",
        "Between two thousand and one and two thousand and nine, China used patience to get its entry ticket to the world factory. This period was crucial for China's economic rise and global integration.",
        "test2_chapter8.wav",
    ),
    (
        "test3",
        "The year two thousand and eight was a turning point. While the United States faced the subprime mortgage crisis, China hosted the Beijing Olympics and demonstrated its growing global influence.",
        "test3_2008.wav",
    ),
)
TEST_TEXTS = [
    {"id": case_id, "text": case_text, "filename": case_filename}
    for case_id, case_text, case_filename in _TEST_ROWS
]

# Transcript passed as the prompt text alongside the reference audio
# (the poem named in the module header).
REFERENCE_TEXT = "白日依山尽，黄河入海流。欲穷千里目，更上一层楼。"

# Generate cloned voice: one output file per entry in TEST_TEXTS, each
# synthesised with the reference recording and its transcript as the prompt.
_start_rule = "=" * 70
print(f"\n{_start_rule}")
print("STARTING VOICE CLONING TEST")
print(_start_rule)
print(f"Reference text: {REFERENCE_TEXT}")

for case in TEST_TEXTS:
    print(f"\n🎙️ Generating test: {case['id']}")
    raw_text = case['text']
    print(f"Text: {raw_text[:50]}...")

    # Spell out known numerals before synthesis.
    spoken_text = preprocess_text(raw_text)
    print(f"Processed: {spoken_text[:50]}...")

    try:
        # Synthesise the text, prompting with the reference audio/transcript.
        waveform = model.generate(
            text=spoken_text,
            prompt_wav_path=REFERENCE_FILE,
            prompt_text=REFERENCE_TEXT,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        # Write the result at the sample rate reported by the model.
        target_path = os.path.join(OUTPUT_DIR, case['filename'])
        sample_rate = model.tts_model.sample_rate
        sf.write(target_path, waveform, sample_rate)

        # Confirm the file actually landed on disk before reporting success.
        if not os.path.exists(target_path):
            print("❌ Failed to save audio")
            continue

        byte_count = os.path.getsize(target_path)
        seconds = len(waveform) / sample_rate
        print("✅ Voice cloning successful!")
        print(f"   File: {target_path}")
        print(f"   Size: {byte_count} bytes")
        print(f"   Duration: {seconds:.2f} seconds")

    except Exception as generation_error:
        # A failed case is reported with its traceback; the loop carries on
        # with the remaining cases.
        print(f"❌ Error generating audio: {generation_error}")
        import traceback
        traceback.print_exc()

# Summary: repeat the inputs, then list each expected output file with its
# size on disk, or FAILED when the file is not present.
_summary_rule = "=" * 70
print(f"\n{_summary_rule}")
print("VOICE CLONING TEST COMPLETE")
print(_summary_rule)
print(f"Reference audio: {REFERENCE_FILE}")
print(f"Reference text: {REFERENCE_TEXT}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for case in TEST_TEXTS:
    wav_name = case['filename']
    wav_path = os.path.join(OUTPUT_DIR, wav_name)
    if not os.path.exists(wav_path):
        print(f"   - {wav_name} (FAILED)")
        continue
    print(f"   - {wav_name} ({os.path.getsize(wav_path)} bytes)")
print(f"\n{_summary_rule}")