#!/usr/bin/env python3
"""
Voice cloning test using VoxCPM.

Reference audio: hosts/ben_guanquelou.wav
"""
|
|
import os
import re
import sys
import traceback

import numpy as np
import soundfile as sf
|
|
|
|
# Paths -- every location below is derived from the single workspace root.
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
REFERENCE_FILE = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "voice_cloning_test")
|
|
|
|
# Create the output directory up front; exist_ok makes this a no-op on reruns.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# The reference recording is required for cloning: stop with exit status 1
# when it is missing.
if os.path.exists(REFERENCE_FILE):
    print(f"✅ Reference audio: {REFERENCE_FILE}")
else:
    print(f"❌ Reference audio not found: {REFERENCE_FILE}")
    sys.exit(1)

# Put the VoxCPM source tree first on the import search path.
voxcpm_src_dir = os.path.join(VOXCPM_DIR, "src")
sys.path.insert(0, voxcpm_src_dir)
print("✅ Added VoxCPM path")
|
|
|
|
# Import VoxCPM -- any failure while importing aborts the script with status 1.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)
|
|
|
|
# Model path -- try each known directory name in order and keep the first
# one that exists; exit with status 1 when none of them is present.
_MODEL_DIR_CANDIDATES = ("openbmb__VoxCPM1.5", "VoxCPM1.5")
for _candidate in _MODEL_DIR_CANDIDATES:
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _candidate)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    # The loop finished without a break: no candidate directory exists.
    print("❌ Model path not found")
    sys.exit(1)
print(f"✅ Model path: {LOCAL_MODEL_PATH}")
|
|
|
|
# Initialize VoxCPM -- a failure to construct the model is fatal (exit 1).
print("\n🚀 Initializing VoxCPM...")
_init_options = {
    "voxcpm_model_path": LOCAL_MODEL_PATH,
    # NOTE(review): the denoiser and optimization are switched off here;
    # confirm the exact effect of these flags against the VoxCPM docs.
    "enable_denoiser": False,
    "optimize": False,
}
try:
    model = VoxCPM(**_init_options)
    print("✅ VoxCPM initialized successfully")
except Exception as init_error:
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)
|
|
|
|
# Text preprocessing function (handle numbers)
|
|
# Spoken forms for the numeric tokens this script knows how to expand.
_NUMBER_WORDS = {
    "2001": "two thousand and one",
    "2009": "two thousand and nine",
    "2008": "two thousand and eight",
    "70%": "seventy percent",
    "10": "ten",
}

# Match a known token only when it is a complete number on its own:
#   * not preceded by a digit, or by a digit plus "." / ","  (so "170%",
#     "3.10" and "1,10" are left alone);
#   * digit-only tokens must not be followed by a digit, or by "." / ","
#     plus a digit (so "2010", "100", "10,000" and "10.5" are left alone,
#     while sentence punctuation as in "in 2008, ..." still matches).
_NUMBER_TOKEN = re.compile(
    r"(?<!\d)(?<!\d[.,])(?:(?:2001|2009|2008|10)(?![.,]?\d)|70%)"
)


def preprocess_text(text):
    """Spell out the known numeric tokens in *text* for better pronunciation.

    Only the tokens "2001", "2008", "2009", "70%" and "10" are expanded, and
    only when they appear as standalone numbers. Earlier plain substring
    replacement corrupted longer numbers (for example "2010" became "20ten");
    such numbers are now returned untouched.

    Args:
        text: The input string to normalize.

    Returns:
        A new string with each standalone known token replaced by its
        spoken English form; all other text is unchanged.
    """
    # Single pass over the text: replacements can never re-match each other.
    return _NUMBER_TOKEN.sub(lambda match: _NUMBER_WORDS[match.group(0)], text)
|
|
|
|
# Test texts -- one dict per utterance with the keys "id", "text" and
# "filename" (the output file name used inside OUTPUT_DIR).
TEST_TEXTS = [
    {"id": test_id, "text": text, "filename": filename}
    for test_id, text, filename in (
        (
            "test1",
            "Hello, this is a voice cloning test using VoxCPM. I am speaking in English to demonstrate the voice cloning capability. The system captures my tone, rhythm, and speaking style from the reference audio.",
            "test1_intro.wav",
        ),
        (
            "test2",
            "Between two thousand and one and two thousand and nine, China used patience to get its entry ticket to the world factory. This period was crucial for China's economic rise and global integration.",
            "test2_chapter8.wav",
        ),
        (
            "test3",
            "The year two thousand and eight was a turning point. While the United States faced the subprime mortgage crisis, China hosted the Beijing Olympics and demonstrated its growing global influence.",
            "test3_2008.wav",
        ),
    )
]
|
|
|
|
# Generate cloned voice
banner = "=" * 70
print(f"\n{banner}")
print("STARTING VOICE CLONING TEST")
print(banner)

# File names written successfully during THIS run. The summary consults this
# set instead of relying only on the filesystem, so a stale file left behind
# by an earlier run is not reported as a fresh success.
generated_files = set()

for test in TEST_TEXTS:
    print(f"\n🎙️ Generating test: {test['id']}")
    print(f"Text: {test['text'][:50]}...")

    # Preprocess text
    processed_text = preprocess_text(test['text'])
    print(f"Processed: {processed_text[:50]}...")

    # Best effort per test: a failure is reported with its traceback and the
    # loop moves on to the next text.
    try:
        # Generate audio with voice cloning
        audio = model.generate(
            text=processed_text,
            prompt_wav_path=REFERENCE_FILE,  # Use reference audio for cloning
            prompt_text=None,  # No need for reference text
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        output_file = os.path.join(OUTPUT_DIR, test['filename'])
        sf.write(output_file, audio, sample_rate)

        # Verify
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            # assumes len(audio) is the number of samples -- TODO confirm
            duration = len(audio) / sample_rate
            print("✅ Voice cloning successful!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
            generated_files.add(test['filename'])
        else:
            print("❌ Failed to save audio")

    except Exception as e:
        print(f"❌ Error generating audio: {e}")
        traceback.print_exc()

# Summary
print(f"\n{banner}")
print("VOICE CLONING TEST COMPLETE")
print(banner)
print(f"Reference audio: {REFERENCE_FILE}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for test in TEST_TEXTS:
    output_file = os.path.join(OUTPUT_DIR, test['filename'])
    # A file counts as generated only if this run wrote it AND it is still
    # present on disk.
    if test['filename'] in generated_files and os.path.exists(output_file):
        size = os.path.getsize(output_file)
        print(f" - {test['filename']} ({size} bytes)")
    else:
        print(f" - {test['filename']} (FAILED)")
print(f"\n{banner}")