Initial commit for TTS project
This commit is contained in:
167
scripts/generation/generate_accent_demo_local.py
Normal file
167
scripts/generation/generate_accent_demo_local.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Accent demo generator using LOCAL VoxCPM model
|
||||
Using the same successful parameters as the Ben voice cloning
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
|
||||
# Filesystem layout: the VoxCPM checkout and the demo output folder both live
# under a single workspace root.
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")

# Put the checkout's src/ directory first on the import path so the
# `voxcpm` package below is resolved from the local copy.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# The script cannot do anything without VoxCPM, so any failure while
# importing it ends the run with exit status 1.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)
|
||||
|
||||
# Candidate checkpoint directories, tried in order; the first one that exists
# on disk becomes LOCAL_MODEL_PATH.
_MODEL_DIR_CANDIDATES = (
    os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5"),
    os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5"),
)
LOCAL_MODEL_PATH = next(
    (candidate for candidate in _MODEL_DIR_CANDIDATES if os.path.exists(candidate)),
    None,
)
if LOCAL_MODEL_PATH is None:
    # Neither location exists: there is no local model to load.
    print("❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Create the default output directory up front (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")
|
||||
|
||||
# Build the shared VoxCPM instance used by every generator below.
print("\n🚀 Initializing VoxCPM with successful parameters...")
_voxcpm_options = {
    "voxcpm_model_path": LOCAL_MODEL_PATH,
    # Original author's rationale: denoiser off "for better quality".
    "enable_denoiser": False,
    # Original author's rationale: optimization off "to avoid issues".
    "optimize": False,
}
try:
    model = VoxCPM(**_voxcpm_options)
    print("✅ VoxCPM initialized successfully")
except Exception as init_error:
    # Without a model nothing can be generated; stop with exit status 1.
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)
|
||||
|
||||
# Reference recordings used as voice-cloning prompts.
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")

# Ben's recording is the prompt for every generator in this file, so verify it
# is actually on disk instead of reporting success unconditionally. Failing
# here gives one clear message rather than one traceback per demo later.
if not os.path.isfile(REAL_BEN_REF):
    print(f"❌ Ben reference audio not found: {REAL_BEN_REF}")
    sys.exit(1)
print(f"✅ Ben reference audio: {REAL_BEN_REF}")

# Judy's recording is not referenced by the generators visible in this file,
# so a missing file is reported as a warning and does not stop the run.
if os.path.isfile(REAL_JUDY_REF):
    print(f"✅ Judy reference audio: {REAL_JUDY_REF}")
else:
    print(f"⚠️ Judy reference audio not found: {REAL_JUDY_REF}")

# Transcripts of the reference recordings; each must match what is spoken in
# the corresponding audio file because it is passed as the prompt text.
REFERENCE_TEXTS = {
    "ben": "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
    "judy": "题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。",
}
|
||||
|
||||
def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
    """Synthesize ``text`` with Ben's reference voice and save it as a WAV file.

    The clip is written to ``<output_dir>/<accent_name>_demo.wav``.

    NOTE(review): ``accent_name`` only affects the output file name and the
    log lines; it is not passed to the model, so the same ``text`` produces
    the same generation request for every accent. Confirm this is intended.

    Args:
        text: Sentence to synthesize.
        accent_name: Label used in the output file name and progress output.
        output_dir: Directory for the WAV file; created if it does not exist.

    Returns:
        The path of the written file, or ``None`` when generation or saving
        failed (the error and traceback are printed, not raised).
    """
    # Ben's recording and its matching transcript form the cloning prompt.
    ref_audio = REAL_BEN_REF
    ref_text = REFERENCE_TEXTS["ben"]

    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"\n🎙️ Generating {accent_name} accent demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Only the default OUTPUT_DIR is created at module level; a directory
        # supplied by the caller may not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        audio = model.generate(
            text=text,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Confirm the file really landed on disk before reporting success.
        if not os.path.exists(output_file):
            print("❌ Failed to save")
            return None

        file_size = os.path.getsize(output_file)
        duration = len(audio) / sample_rate
        print("✅ Generated successfully!")
        print(f" File: {output_file}")
        print(f" Size: {file_size} bytes")
        print(f" Duration: {duration:.2f} seconds")
        return output_file

    except Exception as e:
        # Best-effort batch script: report the failure and let the caller
        # continue with the next demo.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
    """Synthesize ``pinyin`` with Ben's reference voice and save it as a WAV file.

    The clip is written to ``<output_dir>/cantonese_pinyin_demo.wav``.

    Args:
        text: Original sentence; only shown in the progress output, it is not
            sent to the model.
        pinyin: The string that is actually synthesized.
        output_dir: Directory for the WAV file; created if it does not exist.

    Returns:
        The path of the written file, or ``None`` when generation or saving
        failed (the error and traceback are printed, not raised).
    """
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("\n🎙️ Generating Cantonese pinyin demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Only the default OUTPUT_DIR is created at module level; a directory
        # supplied by the caller may not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        audio = model.generate(
            text=pinyin,
            prompt_wav_path=REAL_BEN_REF,  # Ben's recording is the prompt
            prompt_text=REFERENCE_TEXTS["ben"],
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Confirm the file really landed on disk before reporting success.
        if not os.path.exists(output_file):
            print("❌ Failed to save")
            return None

        file_size = os.path.getsize(output_file)
        duration = len(audio) / sample_rate
        print("✅ Generated successfully!")
        print(f" File: {output_file}")
        print(f" Size: {file_size} bytes")
        print(f" Duration: {duration:.2f} seconds")
        return output_file

    except Exception as e:
        # Best-effort batch script: report the failure and return to the caller.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
if __name__ == "__main__":
    # Every accent demo speaks the same English sentence.
    sample_sentence = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"
    for accent_label in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo_with_real_reference(sample_sentence, accent_label)

    # NOTE(review): the second argument is named "pinyin" by the callee but the
    # value passed here is an English sentence, not a romanization — confirm
    # this is what should be synthesized.
    generate_cantonese_pinyin_demo(
        "张学友是香港著名歌手,被誉为歌神。",
        "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs.",
    )

    # Closing banner. NOTE(review): it is printed unconditionally, even when
    # individual generations above reported an error.
    banner = "=" * 70
    print(f"\n{banner}")
    print("ACCENT DEMOS GENERATION COMPLETE")
    print(banner)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nAll demos generated with the SAME parameters that worked for Ben's voice!")
|
||||
Reference in New Issue
Block a user