#!/usr/bin/env python3
"""
Fixed emotion test for VoxCPM
Using proper parameter format
"""

import os
import sys

import soundfile as sf
import numpy as np

# Paths
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_verification")
os.makedirs(OUTPUT_DIR, exist_ok=True)  # make sure the output directory exists before writing into it

# Add VoxCPM to path
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))

# Import VoxCPM
try:
    from voxcpm.core import VoxCPM
except Exception as e:
    print(f"❌ Failed to import VoxCPM: {e}")
    sys.exit(1)

# Use LOCAL model
LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
    # Fall back to the alternate local directory name
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5")
    if not os.path.exists(LOCAL_MODEL_PATH):
        print(f"❌ Local model path not found: {LOCAL_MODEL_PATH}")
        sys.exit(1)

# Initialize VoxCPM
model = VoxCPM(
    voxcpm_model_path=LOCAL_MODEL_PATH,
    enable_denoiser=False,
    optimize=False,
)

# Test sentence
test_sentence = "Hello everyone! I'm speaking with different emotion today. How does it sound to you?"


def create_emotion_reference(emotion):
    """Create emotion reference audio"""
    ref_file = os.path.join(WORKSPACE, f"reference_{emotion}.wav")

    # Emotion-specific reference texts
    emotion_texts = {
        "happy": "Wow! I'm so excited and happy today! Everything is going great! I can't believe how wonderful this day is!",
        "sad": "I'm feeling very sad and lonely today. Nothing seems to be going right. Everything feels so overwhelming.",
        "angry": "I'm really angry and frustrated! This is completely unacceptable! I can't believe what just happened!",
        "calm": "I'm feeling very calm and peaceful today. Everything is quiet and serene. I feel so relaxed and at ease.",
    }

    ref_text = emotion_texts.get(emotion)

    if not os.path.exists(ref_file):
        print(f"🎙️ Creating {emotion} emotion reference...")
        print(f"Reference text: {ref_text[:50]}...")

        # Generate reference audio with emotion
        audio = model.generate(
            text=ref_text,
            cfg_value=2.5,
            inference_timesteps=20,
            normalize=True,
        )

        sf.write(ref_file, audio, model.tts_model.sample_rate)
        print(f"✅ Created {emotion} reference: {ref_file}")

    return ref_file, ref_text


def test_emotion(emotion):
    """Test emotion generation"""
    ref_audio, ref_text = create_emotion_reference(emotion)

    output_file = os.path.join(OUTPUT_DIR, f"{emotion}_emotion_test.wav")
    print(f"\n😊 Testing {emotion} emotion...")
    print(f"Test sentence: {test_sentence}")

    try:
        # Generate audio with emotion, cloning the reference prompt
        audio = model.generate(
            text=test_sentence,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            retry_badcase=True,
        )

        # Save audio
        sf.write(output_file, audio, model.tts_model.sample_rate)

        if os.path.exists(output_file):
            duration = len(audio) / model.tts_model.sample_rate
            print(f"✅ Generated {emotion} emotion: {output_file}")
            print(f"   Duration: {duration:.2f} seconds")
        else:
            print(f"❌ Failed to save {output_file}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    print("=" * 70)
    print("FIXED EMOTION EXPRESSION TEST")
    print("=" * 70)

    emotions = ["happy", "sad", "angry", "calm"]
    for emotion in emotions:
        test_emotion(emotion)

    print(f"\n{'=' * 70}")
    print("EMOTION TEST COMPLETE")
    print("=" * 70)
    print(f"Output directory: {OUTPUT_DIR}")
    print("\n📋 Generated emotion files:")
    for emotion in emotions:
        print(f"  - {emotion}_emotion_test.wav")
    print("\n🎧 Please listen to the files to verify emotion differences!")
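
# Usage sketch — the script filename below is an assumption (save this file
# under any name); the paths come from the constants defined at the top:
#
#     cd /root/tts
#     python3 fixed_emotion_test.py
#
# Expected outputs: reference_<emotion>.wav files under /root/tts and one
# <emotion>_emotion_test.wav per emotion under /root/tts/accent_verification/.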