206 lines
7.2 KiB
Python
206 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Accent verification test for VoxCPM
|
|
Using different reference audios for different accents
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import soundfile as sf
|
|
import numpy as np
|
|
|
|
# ---------------------------------------------------------------------------
# Filesystem layout (everything lives under one workspace directory)
# ---------------------------------------------------------------------------
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_verification")

# Put the VoxCPM source tree at the front of the import search path.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# ---------------------------------------------------------------------------
# Import VoxCPM; the script cannot do anything useful without it.
# ---------------------------------------------------------------------------
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)

# ---------------------------------------------------------------------------
# Locate the local model checkpoint: try each known directory name in order
# and stop at the first one that exists on disk.
# ---------------------------------------------------------------------------
_models_root = os.path.join(VOXCPM_DIR, "models")
for _candidate in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(_models_root, _candidate)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    # Neither directory name was found.
    print("❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# ---------------------------------------------------------------------------
# Output directory for the generated test clips
# ---------------------------------------------------------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# ---------------------------------------------------------------------------
# Model initialization (denoiser and optimization both switched off)
# ---------------------------------------------------------------------------
print("\n🚀 Initializing VoxCPM...")
try:
    _init_options = {
        "voxcpm_model_path": LOCAL_MODEL_PATH,
        "enable_denoiser": False,
        "optimize": False,
    }
    model = VoxCPM(**_init_options)
    print("✅ VoxCPM initialized successfully")
except Exception as init_error:
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)

# Sentence spoken in every accent test so the clips are directly comparable.
test_sentence = (
    "Hello everyone! I'm speaking with a different accent today. "
    "How does it sound to you?"
)
|
|
|
|
# Create accent-specific reference audios
|
|
def create_accent_reference(accent_name, description):
    """Return the reference clip and its transcript for one accent.

    The clip lives at ``WORKSPACE/reference_<accent_name>.wav``. When that
    file is missing it is synthesized by the global ``model`` from the
    accent's reference sentence (no voice prompt is supplied) and written
    to disk; an existing file is reused as-is.

    Args:
        accent_name: Key selecting the reference sentence. Unknown keys
            fall back to the "american" sentence.
        description: Human-readable label; accepted but not used here.

    Returns:
        A ``(wav_path, transcript)`` tuple.
    """
    wav_path = os.path.join(WORKSPACE, f"reference_{accent_name}.wav")

    # One sentence per accent, worded with locale-specific greetings/places.
    transcripts = {
        "indian": "Namaste! How are you doing today? I'm from India. The weather here is quite warm and sunny.",
        "british": "Hello there! How are you today? I'm from London. The weather here is quite rainy and cold.",
        "american": "Hey! What's up? I'm from New York. The weather here is pretty nice today.",
        "australian": "G'day mate! How ya goin'? I'm from Sydney. The weather here is bloody fantastic!",
        "russian": "Privet! Kak dela? I'm from Moscow. The weather here is very cold with snow.",
        "singaporean": "Hi there! How's it going? I'm from Singapore. We have delicious hawker food here.",
        "hongkong": "Nei ho! How are you? I'm from Hong Kong. It's a busy city with great food.",
    }
    if accent_name in transcripts:
        transcript = transcripts[accent_name]
    else:
        transcript = transcripts["american"]

    # Reuse a previously generated clip rather than synthesizing again.
    if os.path.exists(wav_path):
        return wav_path, transcript

    print(f"🎙️ Creating {accent_name} accent reference...")
    print(f"Reference text: {transcript}")

    samples = model.generate(
        text=transcript,
        cfg_value=2.5,
        inference_timesteps=20,
        normalize=True,
    )
    sf.write(wav_path, samples, model.tts_model.sample_rate)
    print(f"✅ Created {accent_name} reference: {wav_path}")

    return wav_path, transcript
|
|
|
|
# Test different accents
|
|
def test_accent(accent_name, description):
    """Synthesize the shared test sentence using one accent's reference clip.

    The reference clip and transcript come from ``create_accent_reference``
    and are passed to the global ``model`` as the voice prompt. The result
    is written to ``OUTPUT_DIR/<accent_name>_accent_test.wav`` and a short
    report (path, size, duration) is printed. Any exception raised during
    generation or saving is printed with its traceback and not re-raised.

    Args:
        accent_name: Accent key, also used in the output file name.
        description: Human-readable label, forwarded to
            ``create_accent_reference``.
    """
    prompt_wav, prompt_transcript = create_accent_reference(accent_name, description)
    wav_out = os.path.join(OUTPUT_DIR, f"{accent_name}_accent_test.wav")

    print(f"\n🎯 Testing {accent_name} accent...")
    print(f"Test sentence: {test_sentence}")

    try:
        samples = model.generate(
            text=test_sentence,
            prompt_wav_path=prompt_wav,
            prompt_text=prompt_transcript,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            retry_badcase=True,
        )
        sf.write(wav_out, samples, model.tts_model.sample_rate)

        # Confirm the file really landed on disk before reporting success.
        if not os.path.exists(wav_out):
            print("❌ Failed to save")
            return

        size_bytes = os.path.getsize(wav_out)
        seconds = len(samples) / model.tts_model.sample_rate
        print("✅ Generated successfully!")
        print(f" File: {wav_out}")
        print(f" Size: {size_bytes} bytes")
        print(f" Duration: {seconds:.2f} seconds")
    except Exception as err:
        print(f"❌ Error: {err}")
        import traceback
        traceback.print_exc()
|
|
|
|
# Test emotion capability
|
|
def test_emotion():
    """Synthesize one clip per emotion from emotionally worded text.

    For each entry in the emotion table the global ``model`` speaks that
    entry's sentence without any voice prompt, so whatever expressiveness
    is heard has to come from the text itself. Each clip is written to
    ``OUTPUT_DIR/<emotion>_emotion_test.wav``.

    A failure in one emotion is printed with its traceback and the loop
    moves on to the next one; nothing is re-raised.
    """
    emotions = {
        "happy": "Wow! I'm so excited and happy today! Everything is going great!",
        "sad": "I'm feeling very sad and lonely today. Nothing seems to be going right.",
        "angry": "I'm really angry and frustrated! This is completely unacceptable!",
        "calm": "I'm feeling very calm and peaceful today. Everything is quiet and serene.",
    }

    for emotion, emotion_text in emotions.items():
        output_file = os.path.join(OUTPUT_DIR, f"{emotion}_emotion_test.wav")
        print(f"\n😊 Testing {emotion} emotion...")

        try:
            # The emotional sentence is the text to speak. No prompt_wav_path /
            # prompt_text is passed: VoxCPM expects those two as a pair
            # (NOTE(review): confirm against the installed voxcpm.core), and
            # there is no reference recording for emotions.
            audio = model.generate(
                text=emotion_text,
                cfg_value=2.5,
                inference_timesteps=20,
                normalize=True,
            )

            sample_rate = model.tts_model.sample_rate
            sf.write(output_file, audio, sample_rate)

            if os.path.exists(output_file):
                duration = len(audio) / sample_rate
                print(f"✅ Generated {emotion} emotion: {output_file}")
                print(f" Duration: {duration:.2f} seconds")
            else:
                print("❌ Failed to save")

        except Exception as e:
            # Best-effort loop: report the failure in full and keep going.
            print(f"❌ Error: {e}")
            import traceback
            traceback.print_exc()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run every accent test, then the emotion tests,
    # then print a summary of the expected output files.
    banner = "=" * 70

    print(banner)
    print("VOXCPM ACCENT AND EMOTION VERIFICATION TEST")
    print(banner)

    # (accent key, human-readable description) pairs, tested in this order.
    accents = [
        ("indian", "Indian English accent"),
        ("british", "British English accent"),
        ("american", "American English accent"),
        ("australian", "Australian English accent"),
        ("russian", "Russian English accent"),
        ("singaporean", "Singaporean English accent"),
        ("hongkong", "Hong Kong English accent"),
    ]

    for accent_name, description in accents:
        test_accent(accent_name, description)

    print(f"\n{banner}")
    print("TESTING EMOTION EXPRESSION CAPABILITY")
    print(banner)
    test_emotion()

    print(f"\n{banner}")
    print("VERIFICATION TEST COMPLETE")
    print(banner)
    print(f"Output directory: {OUTPUT_DIR}")

    # File names follow the patterns used by test_accent / test_emotion.
    expected_files = [f"{accent_name}_accent_test.wav" for accent_name, _ in accents]
    expected_files += [
        f"{emotion}_emotion_test.wav" for emotion in ("happy", "sad", "angry", "calm")
    ]

    # The individual tests swallow their own errors, so check the disk rather
    # than assuming every file was produced. A file left over from an earlier
    # run still counts as present.
    print("\n📋 Generated files:")
    for file_name in expected_files:
        if os.path.exists(os.path.join(OUTPUT_DIR, file_name)):
            print(f" - {file_name}")
        else:
            print(f" - {file_name} (missing - generation failed)")

    print("\n🎧 Please listen to the files to verify accent and emotion differences!")
|