Files
tts/scripts/generate/test_voice_cloning_fixed.py
2026-01-19 10:27:41 +08:00

156 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""
Voice cloning test using VoxCPM
Reference audio: hosts/ben_guanquelou.wav
Reference text: 登鹳雀楼
"""
import os
import sys
import soundfile as sf
import numpy as np
# Filesystem layout used by this test run
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
REFERENCE_FILE = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "voice_cloning_test")

# Create the output directory up front so later writes cannot fail on it
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# The reference recording is mandatory; abort early when it is absent
if os.path.exists(REFERENCE_FILE):
    print(f"✅ Reference audio: {REFERENCE_FILE}")
else:
    print(f"❌ Reference audio not found: {REFERENCE_FILE}")
    sys.exit(1)

# Make the bundled VoxCPM sources importable ahead of anything installed
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# Import VoxCPM only after sys.path has been extended
try:
    from voxcpm.core import VoxCPM
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM imported successfully")

# Locate the model directory: try each known directory name in order and
# keep the first one that exists on disk
_MODELS_ROOT = os.path.join(VOXCPM_DIR, "models")
for _model_dir_name in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(_MODELS_ROOT, _model_dir_name)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    # Neither candidate directory exists
    print("❌ Model path not found")
    sys.exit(1)
print(f"✅ Model path: {LOCAL_MODEL_PATH}")

# Build the model instance; any failure here is fatal for the test
print("\n🚀 Initializing VoxCPM...")
try:
    model = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH,
        enable_denoiser=False,
        optimize=False,
    )
except Exception as init_error:
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM initialized successfully")
# Text preprocessing function (handle numbers)
def preprocess_text(text):
    """Spell out selected numeric tokens for better pronunciation.

    Only the tokens listed in ``spoken_forms`` are rewritten, and only when
    they stand alone as a number. A token that is merely part of a longer
    number (the "10" in "2010" or "100", the "70%" in "170%") or part of a
    decimal ("3.10", "10.5") is left untouched, so unrelated numbers are not
    corrupted.

    Args:
        text: The text to normalize.

    Returns:
        The text with every standalone known token replaced by its spoken
        form.
    """
    import re  # local import keeps this function self-contained

    spoken_forms = {
        "2001": "two thousand and one",
        "2009": "two thousand and nine",
        "2008": "two thousand and eight",
        "70%": "seventy percent",
        "10": "ten",
    }
    alternatives = "|".join(re.escape(token) for token in spoken_forms)
    # Lookbehinds reject a preceding digit or "<digit>."; the lookahead
    # rejects a following digit or ".<digit>". A sentence-ending period
    # ("Top 10.") is still allowed because no digit follows the dot.
    standalone_token = re.compile(
        r"(?<!\d)(?<!\d\.)(?:" + alternatives + r")(?!\d|\.\d)"
    )
    return standalone_token.sub(
        lambda match: spoken_forms[match.group(0)], text
    )
# Test texts: one row per case as (id, output filename, text to synthesize)
_TEST_CASES = (
    (
        "test1",
        "test1_intro.wav",
        "Hello, this is a voice cloning test using VoxCPM. I am speaking in English to demonstrate the voice cloning capability. The system captures my tone, rhythm, and speaking style from the reference audio.",
    ),
    (
        "test2",
        "test2_chapter8.wav",
        "Between two thousand and one and two thousand and nine, China used patience to get its entry ticket to the world factory. This period was crucial for China's economic rise and global integration.",
    ),
    (
        "test3",
        "test3_2008.wav",
        "The year two thousand and eight was a turning point. While the United States faced the subprime mortgage crisis, China hosted the Beijing Olympics and demonstrated its growing global influence.",
    ),
)

# Expand the rows into the dict records consumed by the generation loop
TEST_TEXTS = [
    {"id": case_id, "text": spoken_text, "filename": wav_name}
    for case_id, wav_name, spoken_text in _TEST_CASES
]

# Transcript of the reference recording (the poem 登鹳雀楼), passed to the
# model as the prompt text for voice cloning
REFERENCE_TEXT = "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。"
# Generate cloned voice
print(f"\n{'='*70}")
print("STARTING VOICE CLONING TEST")
print(f"{'='*70}")
print(f"Reference text: {REFERENCE_TEXT}")

# Filenames that were written successfully during THIS run. The summary
# consults this set so that a stale file left over from an earlier run is
# not reported as generated when the current attempt failed.
generated_this_run = set()

for test in TEST_TEXTS:
    print(f"\n🎙️ Generating test: {test['id']}")
    print(f"Text: {test['text'][:50]}...")

    # Preprocess text
    processed_text = preprocess_text(test['text'])
    print(f"Processed: {processed_text[:50]}...")

    try:
        # Generate audio with voice cloning
        audio = model.generate(
            text=processed_text,
            prompt_wav_path=REFERENCE_FILE,  # Use reference audio for cloning
            prompt_text=REFERENCE_TEXT,      # Provide reference text
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True
        )

        # Save audio at the sample rate reported by the model
        sample_rate = model.tts_model.sample_rate
        output_file = os.path.join(OUTPUT_DIR, test['filename'])
        sf.write(output_file, audio, sample_rate)

        # Verify the file landed on disk and report basic statistics
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            duration = len(audio) / sample_rate
            print("✅ Voice cloning successful!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
            generated_this_run.add(test['filename'])
        else:
            print("❌ Failed to save audio")
    except Exception as e:
        # Best effort: report the failure and carry on with the next text
        print(f"❌ Error generating audio: {e}")
        import traceback
        traceback.print_exc()

# Summary
print(f"\n{'='*70}")
print("VOICE CLONING TEST COMPLETE")
print(f"{'='*70}")
print(f"Reference audio: {REFERENCE_FILE}")
print(f"Reference text: {REFERENCE_TEXT}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for test in TEST_TEXTS:
    output_file = os.path.join(OUTPUT_DIR, test['filename'])
    # A file only counts when this run produced it; mere existence on disk
    # could be a leftover from a previous run.
    if test['filename'] in generated_this_run and os.path.exists(output_file):
        size = os.path.getsize(output_file)
        print(f" - {test['filename']} ({size} bytes)")
    else:
        print(f" - {test['filename']} (FAILED)")
print(f"\n{'='*70}")