Files
tts/scripts/generation/generate_accent_demo_optimized.py
2026-01-19 10:27:41 +08:00

119 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
Optimized accent demo generator using VoxCPM
Improved version with better parameters and shorter text
"""
import os
import numpy as np
import soundfile as sf
from voxcpm import VoxCPM
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos_optimized"):
    """Synthesize ``text`` conditioned on a per-accent reference clip.

    A reference clip for the accent is generated once (and reused on later
    calls if the file already exists in the current working directory), then
    passed back to the model as the prompt for the actual demo.

    Args:
        model: Object exposing ``generate(text=..., prompt_wav_path=...,
            prompt_text=..., cfg_value=..., inference_timesteps=...)``.
            If it also exposes ``tts_model.sample_rate`` that rate is used
            when writing audio.
        text: Sentence to synthesize for the demo.
        accent_name: One of ``"indian"``, ``"russian"``, ``"singaporean"``,
            ``"hongkong"``.
        output_dir: Directory the demo WAV is written to; created on demand.

    Returns:
        Path of the written demo file, or ``None`` when ``accent_name`` is
        not recognized (a message is printed in that case).
    """
    # Reference clip location for each supported accent.
    ref_audio_map = {
        "indian": "reference_indian_opt.wav",
        "russian": "reference_russian_opt.wav",
        "singaporean": "reference_singaporean_opt.wav",
        "hongkong": "reference_hongkong_opt.wav",
    }
    # Transcript spoken in the matching reference clip.
    ref_text_map = {
        "indian": "Hello there! How are you today? I'm from India. The weather here is quite warm.",
        "russian": "Hello! How are you doing? I'm from Russia. The winters here are very cold.",
        "singaporean": "Hi! How's it going? I'm from Singapore. We have delicious hawker food here.",
        "hongkong": "Hey! How are you? I'm from Hong Kong. It's a bustling city with amazing food.",
    }

    ref_audio = ref_audio_map.get(accent_name)
    ref_text = ref_text_map.get(accent_name)
    # Validate before touching the filesystem so a typo in the accent name
    # does not leave an empty output directory behind.
    if not ref_audio or not ref_text:
        print(f"Invalid accent name: {accent_name}")
        return None

    os.makedirs(output_dir, exist_ok=True)

    # Write audio at the rate the model reports when it exposes one; a
    # hard-coded rate that differs from the model's output plays back at the
    # wrong pitch/speed. 24000 is kept as the fallback for models that do not
    # advertise a rate.
    sample_rate = getattr(getattr(model, "tts_model", None), "sample_rate", 24000)

    # Create the reference clip only if a previous run has not already done so.
    if not os.path.exists(ref_audio):
        print(f"Generating optimized reference audio for {accent_name}...")
        audio = model.generate(
            text=ref_text,
            cfg_value=3.0,  # Higher CFG for better quality
            inference_timesteps=30,  # More steps for better quality
        )
        sf.write(ref_audio, audio, sample_rate)
        print(f"Generated optimized reference audio: {ref_audio}")

    # Generate the demo itself, prompted with the reference clip + transcript.
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating optimized {accent_name} accent demo...")
    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=3.0,  # Higher CFG for better adherence to prompt
        inference_timesteps=30,  # More steps for better quality
    )
    sf.write(output_file, audio, sample_rate)
    print(f"Generated optimized {accent_name} accent demo: {output_file}")
    return output_file
def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos_optimized"):
    """Synthesize the Cantonese demo clip and return the path written.

    A Chinese-language reference clip is generated once (and reused on later
    calls if ``reference_cantonese_opt.wav`` already exists in the current
    working directory), then passed back to the model as the prompt.

    Args:
        model: Object exposing ``generate(text=..., prompt_wav_path=...,
            prompt_text=..., cfg_value=..., inference_timesteps=...)``.
            If it also exposes ``tts_model.sample_rate`` that rate is used
            when writing audio.
        text: Original (character) form of the sentence. Only synthesized
            when ``pinyin`` is empty or ``None``.
        pinyin: Romanized form of the sentence; this is what gets
            synthesized whenever it is non-empty.
        output_dir: Directory the demo WAV is written to; created on demand.

    Returns:
        Path of the written demo file.
    """
    os.makedirs(output_dir, exist_ok=True)

    ref_audio = "reference_cantonese_opt.wav"
    ref_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌。"

    # Write audio at the rate the model reports when it exposes one; a
    # hard-coded rate that differs from the model's output plays back at the
    # wrong pitch/speed. 24000 is kept as the fallback for models that do not
    # advertise a rate.
    sample_rate = getattr(getattr(model, "tts_model", None), "sample_rate", 24000)

    # Create the reference clip only if a previous run has not already done so.
    if not os.path.exists(ref_audio):
        print("Generating optimized Cantonese reference audio...")
        audio = model.generate(
            text=ref_text,
            cfg_value=3.0,
            inference_timesteps=30,
        )
        sf.write(ref_audio, audio, sample_rate)
        print(f"Generated optimized Cantonese reference audio: {ref_audio}")

    # ``text`` used to be silently ignored; it now serves as the fallback so
    # a caller without a romanization still gets the sentence they passed.
    spoken = pinyin or text

    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating optimized Cantonese pinyin demo...")
    audio = model.generate(
        text=spoken,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=3.0,
        inference_timesteps=30,
    )
    sf.write(output_file, audio, sample_rate)
    print(f"Generated optimized Cantonese pinyin demo: {output_file}")
    return output_file
if __name__ == "__main__":
    # Load the pretrained checkpoint once; every demo below shares it.
    print("Initializing VoxCPM...")
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # The same short sentence is rendered once per supported accent.
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"
    for accent in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo(model, test_text, accent)

    # NOTE(review): despite its name, ``cantonese_pinyin`` holds an English
    # sentence rather than a romanization of ``cantonese_text`` - confirm
    # whether that is intended.
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)

    print("All optimized demos generated successfully!")