#!/usr/bin/env python3
"""
Fixed emotion test for VoxCPM
Using proper parameter format
"""

import os
import sys

import soundfile as sf
import numpy as np

# Paths
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_verification")
os.makedirs(OUTPUT_DIR, exist_ok=True)  # make sure the output directory exists before writing into it

# Add VoxCPM to path
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))

# Import VoxCPM
try:
    from voxcpm.core import VoxCPM
except Exception as e:
    print(f"❌ Failed to import VoxCPM: {e}")
    sys.exit(1)

# Use LOCAL model
LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
    # Fall back to the alternate local directory name
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5")
    if not os.path.exists(LOCAL_MODEL_PATH):
        print(f"❌ Local model path not found: {LOCAL_MODEL_PATH}")
        sys.exit(1)

# Initialize VoxCPM
model = VoxCPM(
    voxcpm_model_path=LOCAL_MODEL_PATH,
    enable_denoiser=False,
    optimize=False,
)

# Test sentence
test_sentence = "Hello everyone! I'm speaking with different emotion today. How does it sound to you?"


def create_emotion_reference(emotion):
    """Create emotion reference audio"""
    ref_file = os.path.join(WORKSPACE, f"reference_{emotion}.wav")

    # Emotion-specific reference texts
    emotion_texts = {
        "happy": "Wow! I'm so excited and happy today! Everything is going great! I can't believe how wonderful this day is!",
        "sad": "I'm feeling very sad and lonely today. Nothing seems to be going right. Everything feels so overwhelming.",
        "angry": "I'm really angry and frustrated! This is completely unacceptable! I can't believe what just happened!",
        "calm": "I'm feeling very calm and peaceful today. Everything is quiet and serene. I feel so relaxed and at ease.",
    }

    ref_text = emotion_texts.get(emotion)

    if not os.path.exists(ref_file):
        print(f"🎙️ Creating {emotion} emotion reference...")
        print(f"Reference text: {ref_text[:50]}...")

        # Generate reference audio with emotion
        audio = model.generate(
            text=ref_text,
            cfg_value=2.5,
            inference_timesteps=20,
            normalize=True,
        )

        sf.write(ref_file, audio, model.tts_model.sample_rate)
        print(f"✅ Created {emotion} reference: {ref_file}")

    return ref_file, ref_text


def test_emotion(emotion):
    """Test emotion generation"""
    ref_audio, ref_text = create_emotion_reference(emotion)

    output_file = os.path.join(OUTPUT_DIR, f"{emotion}_emotion_test.wav")
    print(f"\n😊 Testing {emotion} emotion...")
    print(f"Test sentence: {test_sentence}")

    try:
        # Generate audio with emotion, cloning the reference prompt
        audio = model.generate(
            text=test_sentence,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            retry_badcase=True,
        )

        # Save audio
        sf.write(output_file, audio, model.tts_model.sample_rate)

        if os.path.exists(output_file):
            duration = len(audio) / model.tts_model.sample_rate
            print(f"✅ Generated {emotion} emotion: {output_file}")
            print(f"   Duration: {duration:.2f} seconds")
        else:
            print(f"❌ Failed to save {output_file}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    print("=" * 70)
    print("FIXED EMOTION EXPRESSION TEST")
    print("=" * 70)

    emotions = ["happy", "sad", "angry", "calm"]
    for emotion in emotions:
        test_emotion(emotion)

    print(f"\n{'=' * 70}")
    print("EMOTION TEST COMPLETE")
    print("=" * 70)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\n📋 Generated emotion files:")
    for emotion in emotions:
        print(f"  - {emotion}_emotion_test.wav")
    print("\n🎧 Please listen to the files to verify emotion differences!")
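
# Usage sketch — the script filename below is an assumption (save this file
# under any name); the paths come from the constants defined at the top:
#
#     cd /root/tts
#     python3 fixed_emotion_test.py
#
# Expected outputs: reference_<emotion>.wav files under /root/tts and one
# <emotion>_emotion_test.wav per emotion under /root/tts/accent_verification/.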