119 lines
4.4 KiB
Python
119 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Optimized accent demo generator using VoxCPM
|
|
Improved version with better parameters and shorter text
|
|
"""
|
|
|
|
import os
|
|
import numpy as np
|
|
import soundfile as sf
|
|
from voxcpm import VoxCPM
|
|
|
|
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos_optimized"):
    """Generate an accent demo clip for ``text`` and write it as a WAV file.

    A per-accent reference clip is synthesised first (only when its file is
    not already present, relative to the current working directory) and is
    then handed to the model as the prompt for the actual demo.

    Args:
        model: Object exposing ``generate(...)`` as called below; the script
            passes a ``VoxCPM`` instance.
        text: Text to speak in the demo clip.
        accent_name: One of ``"indian"``, ``"russian"``, ``"singaporean"``
            or ``"hongkong"``.
        output_dir: Directory that receives ``<accent_name>_demo.wav``;
            created if it does not exist.

    Returns:
        The path of the written demo file, or ``None`` (after printing a
        message) when ``accent_name`` is not a known accent.
    """
    os.makedirs(output_dir, exist_ok=True)

    # accent -> (reference audio path, reference transcript). Keeping both in
    # one table means an accent can never have a path without a transcript.
    references = {
        "indian": (
            "reference_indian_opt.wav",
            "Hello there! How are you today? I'm from India. The weather here is quite warm.",
        ),
        "russian": (
            "reference_russian_opt.wav",
            "Hello! How are you doing? I'm from Russia. The winters here are very cold.",
        ),
        "singaporean": (
            "reference_singaporean_opt.wav",
            "Hi! How's it going? I'm from Singapore. We have delicious hawker food here.",
        ),
        "hongkong": (
            "reference_hongkong_opt.wav",
            "Hey! How are you? I'm from Hong Kong. It's a bustling city with amazing food.",
        ),
    }

    reference = references.get(accent_name)
    if reference is None:
        print(f"Invalid accent name: {accent_name}")
        return None
    ref_audio, ref_text = reference

    # Write files at the model's own output rate: a WAV header that disagrees
    # with the real rate plays back at the wrong speed and pitch. Falls back
    # to the previously hard-coded 24000 when the model exposes no rate.
    # NOTE(review): VoxCPM's README reads the rate from
    # ``model.tts_model.sample_rate`` - confirm for the installed version.
    sample_rate = int(
        getattr(getattr(model, "tts_model", None), "sample_rate", 24000)
    )

    # Shared by both generate() calls: higher CFG and more steps than the
    # library defaults, chosen by the original author for quality.
    quality = {"cfg_value": 3.0, "inference_timesteps": 30}

    # The reference clip is cached on disk; only synthesise it once.
    if not os.path.exists(ref_audio):
        print(f"Generating optimized reference audio for {accent_name}...")
        audio = model.generate(text=ref_text, **quality)
        sf.write(ref_audio, audio, sample_rate)
        print(f"Generated optimized reference audio: {ref_audio}")

    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating optimized {accent_name} accent demo...")

    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        **quality,
    )

    sf.write(output_file, audio, sample_rate)
    print(f"Generated optimized {accent_name} accent demo: {output_file}")
    return output_file
|
|
|
|
def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos_optimized"):
    """Generate the Cantonese "pinyin" demo clip and write it as a WAV file.

    A reference clip is synthesised from a fixed Chinese sentence (only when
    ``reference_cantonese_opt.wav`` is not already present, relative to the
    current working directory) and is then used as the prompt while the model
    speaks ``pinyin``.

    Args:
        model: Object exposing ``generate(...)`` as called below; the script
            passes a ``VoxCPM`` instance.
        text: Accepted for interface compatibility but not used; only
            ``pinyin`` is synthesised.
        pinyin: The string that is actually sent to the model as ``text``.
        output_dir: Directory that receives ``cantonese_pinyin_demo.wav``;
            created if it does not exist.

    Returns:
        The path of the written demo file.
    """
    os.makedirs(output_dir, exist_ok=True)

    ref_audio = "reference_cantonese_opt.wav"
    ref_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌。"

    # Write files at the model's own output rate: a WAV header that disagrees
    # with the real rate plays back at the wrong speed and pitch. Falls back
    # to the previously hard-coded 24000 when the model exposes no rate.
    # NOTE(review): VoxCPM's README reads the rate from
    # ``model.tts_model.sample_rate`` - confirm for the installed version.
    sample_rate = int(
        getattr(getattr(model, "tts_model", None), "sample_rate", 24000)
    )

    # Shared by both generate() calls.
    quality = {"cfg_value": 3.0, "inference_timesteps": 30}

    # The reference clip is cached on disk; only synthesise it once.
    if not os.path.exists(ref_audio):
        print("Generating optimized Cantonese reference audio...")
        audio = model.generate(text=ref_text, **quality)
        sf.write(ref_audio, audio, sample_rate)
        print(f"Generated optimized Cantonese reference audio: {ref_audio}")

    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating optimized Cantonese pinyin demo...")

    audio = model.generate(
        text=pinyin,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        **quality,
    )

    sf.write(output_file, audio, sample_rate)
    print(f"Generated optimized Cantonese pinyin demo: {output_file}")
    return output_file
|
|
|
|
def main():
    """Load VoxCPM, then render the four accent demos and the Cantonese demo."""
    print("Initializing VoxCPM...")
    tts = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # One short sentence is reused for every accent.
    sample_sentence = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    for accent_key in ("indian", "russian", "singaporean", "hongkong"):
        generate_accent_demo(tts, sample_sentence, accent_key)

    hanzi_sentence = "张学友是香港著名歌手,被誉为歌神。"
    # NOTE(review): despite the variable's role, this string is an English
    # rendering of the sentence above, not a pinyin/jyutping romanisation.
    romanised_sentence = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(tts, hanzi_sentence, romanised_sentence)

    print("All optimized demos generated successfully!")


if __name__ == "__main__":
    main()
|