167 lines
6.0 KiB
Python
167 lines
6.0 KiB
Python
#!/usr/bin/env python3
"""
Accent demo generator using the LOCAL VoxCPM model.

Uses the same parameters that worked for the Ben voice cloning.
"""
|
|
|
|
import os
|
|
import sys
|
|
import soundfile as sf
|
|
import numpy as np
|
|
|
|
# Paths
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")

# Put the local VoxCPM sources first on sys.path so they are found on import.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# Import VoxCPM; nothing below can run without it, so exit on failure.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as e:
    print(f"❌ Failed to import VoxCPM: {e}")
    sys.exit(1)

# Use LOCAL model (same as successful Ben voice cloning).
# Two directory names are accepted; the first one that exists wins.
_MODEL_DIR_CANDIDATES = (
    os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5"),
    os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5"),
)
LOCAL_MODEL_PATH = next(
    (path for path in _MODEL_DIR_CANDIDATES if os.path.exists(path)),
    None,
)
if LOCAL_MODEL_PATH is None:
    print("❌ Local model path not found")
    # List what was tried so the failure is actionable.
    for _candidate in _MODEL_DIR_CANDIDATES:
        print(f"   Tried: {_candidate}")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Initialize VoxCPM with the SAME parameters as successful Ben voice cloning
print("\n🚀 Initializing VoxCPM with successful parameters...")
try:
    model = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH,
        enable_denoiser=False,  # Disable denoiser for better quality
        optimize=False,  # Disable optimization to avoid issues
    )
    print("✅ VoxCPM initialized successfully")
except Exception as e:
    print(f"❌ VoxCPM initialization failed: {e}")
    sys.exit(1)

# Use REAL reference audio files (the ones that worked for Ben)
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")

# Report each reference file, and say so explicitly when one is missing
# instead of printing a success mark for a path that does not exist.
for _label, _ref_path in (("Ben", REAL_BEN_REF), ("Judy", REAL_JUDY_REF)):
    if os.path.isfile(_ref_path):
        print(f"✅ {_label} reference audio: {_ref_path}")
    else:
        print(f"⚠️ {_label} reference audio not found: {_ref_path}")

# Reference texts that MATCH the audio
REFERENCE_TEXTS = {
    "ben": "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
    "judy": "题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。",
}
|
|
|
|
def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
    """Synthesize ``text`` with Ben's reference voice and save it as a WAV file.

    The audio is written to ``<output_dir>/<accent_name>_demo.wav``. Note that
    ``accent_name`` is only used for the output file name and the log line; it
    is not passed to the model, so it does not influence the synthesis itself.

    Args:
        text: Sentence to synthesize.
        accent_name: Label used to name the output file.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the written file on success, or ``None`` if generation or
        saving failed (the error and traceback are printed, not raised).
    """
    # Use Ben's reference audio as base (since it worked well)
    ref_audio = REAL_BEN_REF
    ref_text = REFERENCE_TEXTS["ben"]

    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"\n🎙️ Generating {accent_name} accent demo...")
    print(f"Text: {text[:50]}...")

    try:
        # A caller-supplied directory may not exist yet; only the module
        # default is created at start-up.
        os.makedirs(output_dir, exist_ok=True)

        # Generate audio with the SAME parameters as successful Ben voice cloning
        audio = model.generate(
            text=text,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,  # Same as successful Ben
            inference_timesteps=20,  # Same as successful Ben
            normalize=True,  # Enable text normalization
            denoise=False,  # Disable denoise
            retry_badcase=True,  # Enable retry for bad cases
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Verify
        if not os.path.exists(output_file):
            print("❌ Failed to save")
            return None

        file_size = os.path.getsize(output_file)
        duration = len(audio) / sample_rate
        print("✅ Generated successfully!")
        print(f"   File: {output_file}")
        print(f"   Size: {file_size} bytes")
        print(f"   Duration: {duration:.2f} seconds")
        return output_file

    except Exception as e:
        # Best-effort: report the failure and let the caller move on to the
        # next demo instead of aborting the whole run.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
    """Synthesize ``pinyin`` with Ben's reference voice and save it as a WAV file.

    The audio is written to ``<output_dir>/cantonese_pinyin_demo.wav``. Only
    ``pinyin`` is sent to the model; ``text`` is used for logging only.

    Args:
        text: Original sentence, shown in the log for reference.
        pinyin: The string that is actually synthesized.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the written file on success, or ``None`` if generation or
        saving failed (the error and traceback are printed, not raised).
    """
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("\n🎙️ Generating Cantonese pinyin demo...")
    print(f"Text: {text[:50]}...")
    # Also log the string that is really fed to the model, since it differs
    # from the text shown above.
    print(f"Pinyin: {pinyin[:50]}...")

    try:
        # A caller-supplied directory may not exist yet; only the module
        # default is created at start-up.
        os.makedirs(output_dir, exist_ok=True)

        # Generate audio with the SAME parameters
        audio = model.generate(
            text=pinyin,
            prompt_wav_path=REAL_BEN_REF,  # Use Ben's reference
            prompt_text=REFERENCE_TEXTS["ben"],
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Verify
        if not os.path.exists(output_file):
            print("❌ Failed to save")
            return None

        file_size = os.path.getsize(output_file)
        duration = len(audio) / sample_rate
        print("✅ Generated successfully!")
        print(f"   File: {output_file}")
        print(f"   Size: {file_size} bytes")
        print(f"   Duration: {duration:.2f} seconds")
        return output_file

    except Exception as e:
        # Best-effort: report the failure without aborting the caller.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
if __name__ == "__main__":
    # One sentence, rendered once per accent label below.
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # Each label produces its own "<label>_demo.wav" in the output directory.
    for accent_label in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo_with_real_reference(test_text, accent_label)

    # Cantonese pinyin demo.
    # NOTE(review): cantonese_pinyin currently holds an English sentence, not
    # a romanization of cantonese_text — confirm this is intended.
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(cantonese_text, cantonese_pinyin)

    # Closing summary banner.
    banner = "=" * 70
    print(f"\n{banner}")
    print("ACCENT DEMOS GENERATION COMPLETE")
    print(banner)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nAll demos generated with the SAME parameters that worked for Ben's voice!")