Files
tts/scripts/generation/generate_accent_demo_local.py
2026-01-19 10:27:41 +08:00

167 lines
6.0 KiB
Python

#!/usr/bin/env python3
"""
Accent demo generator using the LOCAL VoxCPM model.
Uses the same parameters that worked for the Ben voice cloning.
"""
import os
import sys
import soundfile as sf
import numpy as np
# Paths
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")

# Put the local VoxCPM checkout's src directory first on sys.path so that
# `voxcpm` is resolved from there.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# Import VoxCPM; abort the script with exit code 1 if it cannot be imported.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as e:
    print(f"❌ Failed to import VoxCPM: {e}")
    sys.exit(1)

# Use LOCAL model (same as successful Ben voice cloning).
# The model directory may exist under either name; the first one found wins.
_MODEL_DIR_NAMES = ("openbmb__VoxCPM1.5", "VoxCPM1.5")
LOCAL_MODEL_PATH = next(
    (
        candidate
        for candidate in (
            os.path.join(VOXCPM_DIR, "models", dir_name)
            for dir_name in _MODEL_DIR_NAMES
        )
        if os.path.exists(candidate)
    ),
    None,
)
if LOCAL_MODEL_PATH is None:
    print("❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Initialize VoxCPM with the SAME parameters as successful Ben voice cloning
print("\n🚀 Initializing VoxCPM with successful parameters...")
try:
    model = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH,
        enable_denoiser=False,  # Disable denoiser for better quality
        optimize=False,  # Disable optimization to avoid issues
    )
    print("✅ VoxCPM initialized successfully")
except Exception as e:
    print(f"❌ VoxCPM initialization failed: {e}")
    sys.exit(1)

# Use REAL reference audio files (the ones that worked for Ben)
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")

# Only report a reference file as OK when it is actually present; a missing
# file is flagged here instead of surfacing later as a generation error.
for _ref_label, _ref_path in (("Ben", REAL_BEN_REF), ("Judy", REAL_JUDY_REF)):
    if os.path.isfile(_ref_path):
        print(f"✅ {_ref_label} reference audio: {_ref_path}")
    else:
        print(f"⚠️ {_ref_label} reference audio not found: {_ref_path}")

# Reference texts that MATCH the audio
REFERENCE_TEXTS = {
    "ben": "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
    "judy": "题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。",
}
def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
    """Synthesize ``text`` with Ben's reference voice and save it as a WAV file.

    The audio is written to ``<output_dir>/<accent_name>_demo.wav``. Note that
    ``accent_name`` only selects the output file name and the log label: the
    prompt audio, prompt text and generation parameters are the same for
    every accent.

    Args:
        text: Text passed to ``model.generate``.
        accent_name: Label used in the output file name and progress message.
        output_dir: Directory for the WAV file; created if it does not exist.

    Returns:
        None. Progress is printed; any exception raised while generating or
        saving is caught and printed together with its traceback.
    """
    # Use Ben's reference audio as base (since it worked well)
    ref_audio = REAL_BEN_REF
    ref_text = REFERENCE_TEXTS["ben"]
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")

    print(f"\n🎙️ Generating {accent_name} accent demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Only the default OUTPUT_DIR is created at module level; a
        # caller-supplied directory may not exist yet, so create it here.
        os.makedirs(output_dir, exist_ok=True)

        # Generate audio with the SAME parameters as successful Ben voice cloning
        audio = model.generate(
            text=text,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,  # Same as successful Ben
            inference_timesteps=20,  # Same as successful Ben
            normalize=True,  # Enable text normalization
            denoise=False,  # Disable denoise
            retry_badcase=True,  # Enable retry for bad cases
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Verify that the file landed on disk and report its basic stats
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            duration = len(audio) / sample_rate
            print("✅ Generated successfully!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
        else:
            print("❌ Failed to save")
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with
        # the remaining demos.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
    """Synthesize ``pinyin`` with Ben's reference voice and save it as a WAV file.

    The audio is written to ``<output_dir>/cantonese_pinyin_demo.wav``.

    Args:
        text: Original text; it is only shown in the progress message and is
            not passed to the model.
        pinyin: The string actually passed to ``model.generate`` as ``text``.
        output_dir: Directory for the WAV file; created if it does not exist.

    Returns:
        None. Progress is printed; any exception raised while generating or
        saving is caught and printed together with its traceback.
    """
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")

    print("\n🎙️ Generating Cantonese pinyin demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Only the default OUTPUT_DIR is created at module level; a
        # caller-supplied directory may not exist yet, so create it here.
        os.makedirs(output_dir, exist_ok=True)

        # Generate audio with the SAME parameters
        audio = model.generate(
            text=pinyin,
            prompt_wav_path=REAL_BEN_REF,  # Use Ben's reference
            prompt_text=REFERENCE_TEXTS["ben"],
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Verify that the file landed on disk and report its basic stats
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            duration = len(audio) / sample_rate
            print("✅ Generated successfully!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
        else:
            print("❌ Failed to save")
    except Exception as e:
        # Best-effort: report the failure instead of aborting the script.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # English sentence shared by every accent demo.
    demo_sentence = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # One output file per accent label.
    for accent_label in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo_with_real_reference(demo_sentence, accent_label)

    # Cantonese pinyin demo.
    # NOTE(review): the second argument is an English sentence rather than a
    # romanization of the Chinese text — confirm this is intended.
    source_sentence = "张学友是香港著名歌手,被誉为歌神。"
    spoken_sentence = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(source_sentence, spoken_sentence)

    # Closing summary banner.
    rule = "=" * 70
    print("\n" + rule)
    print("ACCENT DEMOS GENERATION COMPLETE")
    print(rule)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nAll demos generated with the SAME parameters that worked for Ben's voice!")