# tts/scripts/test_accent_verification.py

#!/usr/bin/env python3
"""
Accent verification test for VoxCPM
Using different reference audios for different accents
"""

import os
import sys
import soundfile as sf
import numpy as np

# Paths
# The workspace defaults to /root/tts. Set the TTS_WORKSPACE environment
# variable to run the script from a different checkout; an unset or empty
# variable keeps the default.
WORKSPACE = os.environ.get("TTS_WORKSPACE") or "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_verification")

# Add VoxCPM to path
# The package is imported from the source checkout, so its src/ directory is
# put at the front of the module search path.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# Import VoxCPM
# Any failure while importing is reported and ends the script with exit
# status 1, since nothing below can run without the model class.
try:
    from voxcpm.core import VoxCPM
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM imported successfully")

# Use LOCAL model
# Probe the two known model directory names in order and keep the first one
# that exists; exit with status 1 when neither is present.
for _model_dir_name in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dir_name)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    print("❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Initialize VoxCPM
# The model is built once at module level and shared by every test function
# below. A construction failure is reported and ends the script with status 1.
print("\n🚀 Initializing VoxCPM...")
_model_options = {
    "voxcpm_model_path": LOCAL_MODEL_PATH,
    "enable_denoiser": False,
    "optimize": False,
}
try:
    model = VoxCPM(**_model_options)
except Exception as init_error:
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)
else:
    print("✅ VoxCPM initialized successfully")

# Test sentence
# The same sentence is synthesized for every accent so that the outputs can
# be compared directly.
test_sentence = (
    "Hello everyone! "
    "I'm speaking with a different accent today. "
    "How does it sound to you?"
)

# Create accent-specific reference audios
def create_accent_reference(accent_name, description):
    """Return the reference clip path and its transcript for an accent.

    The clip lives at ``reference_<accent_name>.wav`` inside WORKSPACE. When
    that file is missing it is synthesized from the accent's transcript with
    the shared model and written to disk; an existing file is reused as is.

    Args:
        accent_name: Key selecting the transcript; also used in the file
            name. Unknown names fall back to the "american" transcript.
        description: Human-readable accent label. Accepted for the caller's
            convenience but not used here.

    Returns:
        A ``(reference_wav_path, transcript)`` tuple.
    """
    # Transcript spoken in each accent's reference clip.
    transcripts = dict(
        indian="Namaste! How are you doing today? I'm from India. The weather here is quite warm and sunny.",
        british="Hello there! How are you today? I'm from London. The weather here is quite rainy and cold.",
        american="Hey! What's up? I'm from New York. The weather here is pretty nice today.",
        australian="G'day mate! How ya goin'? I'm from Sydney. The weather here is bloody fantastic!",
        russian="Privet! Kak dela? I'm from Moscow. The weather here is very cold with snow.",
        singaporean="Hi there! How's it going? I'm from Singapore. We have delicious hawker food here.",
        hongkong="Nei ho! How are you? I'm from Hong Kong. It's a busy city with great food.",
    )

    wav_path = os.path.join(WORKSPACE, f"reference_{accent_name}.wav")
    transcript = transcripts.get(accent_name, transcripts["american"])

    # A clip left behind by an earlier run is reused rather than regenerated.
    if os.path.exists(wav_path):
        return wav_path, transcript

    print(f"🎙️ Creating {accent_name} accent reference...")
    print(f"Reference text: {transcript}")

    # Generate reference audio with distinct characteristics
    samples = model.generate(
        text=transcript,
        cfg_value=2.5,
        inference_timesteps=20,
        normalize=True,
    )
    sf.write(wav_path, samples, model.tts_model.sample_rate)
    print(f"✅ Created {accent_name} reference: {wav_path}")

    return wav_path, transcript

# Test different accents
def test_accent(accent_name, description):
    """Synthesize the shared test sentence using one accent's reference clip.

    The reference clip and transcript come from create_accent_reference() and
    are passed to the model as the voice prompt. The result is written to
    ``<accent_name>_accent_test.wav`` in OUTPUT_DIR and its size and duration
    are printed.

    Failures are reported (message plus traceback) instead of raised, so one
    broken accent does not stop the remaining tests.

    Args:
        accent_name: Accent key; used for the reference lookup and the output
            file name.
        description: Human-readable accent label, forwarded to
            create_accent_reference().

    Returns:
        None.
    """
    output_file = os.path.join(OUTPUT_DIR, f"{accent_name}_accent_test.wav")
    print(f"\n🎯 Testing {accent_name} accent...")
    print(f"Test sentence: {test_sentence}")

    try:
        # Building the reference may itself run the model and write a file,
        # so it sits inside the try block: a failure here must be reported
        # like any other generation error rather than abort the whole run.
        ref_audio, ref_text = create_accent_reference(accent_name, description)

        # Generate audio with accent
        audio = model.generate(
            text=test_sentence,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            retry_badcase=True
        )

        # Save audio
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)

        # Verify
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            # presumably audio is a 1-D sample array, so len() is the sample
            # count — TODO confirm against VoxCPM.generate
            duration = len(audio) / sample_rate
            print("✅ Generated successfully!")
            print(f"   File: {output_file}")
            print(f"   Size: {file_size} bytes")
            print(f"   Duration: {duration:.2f} seconds")
        else:
            print("❌ Failed to save")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
# Test emotion capability
def test_emotion():
    """Synthesize one emotionally worded sentence per emotion.

    Each sentence is generated without any voice prompt, so the only
    emotional cue available to the model is the wording itself. The result is
    written to ``<emotion>_emotion_test.wav`` in OUTPUT_DIR.

    Failures are reported (message plus traceback) instead of raised, so one
    broken emotion does not stop the remaining ones.

    Returns:
        None.
    """
    emotions = {
        "happy": "Wow! I'm so excited and happy today! Everything is going great!",
        "sad": "I'm feeling very sad and lonely today. Nothing seems to be going right.",
        "angry": "I'm really angry and frustrated! This is completely unacceptable!",
        "calm": "I'm feeling very calm and peaceful today. Everything is quiet and serene."
    }

    for emotion, emotion_text in emotions.items():
        output_file = os.path.join(OUTPUT_DIR, f"{emotion}_emotion_test.wav")
        print(f"\n😊 Testing {emotion} emotion...")
        print(f"Text: {emotion_text}")

        try:
            # Let the model infer emotion from the text: the emotional
            # sentence is the text to speak. No prompt arguments are passed,
            # because a prompt transcript only makes sense together with the
            # audio it transcribes.
            # NOTE(review): VoxCPM is expected to reject prompt_text without
            # prompt_wav_path — confirm against the installed version.
            audio = model.generate(
                text=emotion_text,
                cfg_value=2.5,
                inference_timesteps=20,
                normalize=True
            )

            # Save audio
            sample_rate = model.tts_model.sample_rate
            sf.write(output_file, audio, sample_rate)

            if os.path.exists(output_file):
                duration = len(audio) / sample_rate
                print(f"✅ Generated {emotion} emotion: {output_file}")
                print(f"   Duration: {duration:.2f} seconds")
            else:
                print("❌ Failed to save")

        except Exception as e:
            print(f"❌ Error: {e}")
            # Same diagnostics as the accent test: show where it failed.
            import traceback
            traceback.print_exc()

if __name__ == "__main__":
    # Script entry point: run every accent test, then the emotion tests, and
    # finish with a summary of which output files are actually on disk.
    banner = "=" * 70

    print(banner)
    print("VOXCPM ACCENT AND EMOTION VERIFICATION TEST")
    print(banner)

    # Test different accents
    accents = [
        ("indian", "Indian English accent"),
        ("british", "British English accent"),
        ("american", "American English accent"),
        ("australian", "Australian English accent"),
        ("russian", "Russian English accent"),
        ("singaporean", "Singaporean English accent"),
        ("hongkong", "Hong Kong English accent")
    ]

    for accent_name, description in accents:
        test_accent(accent_name, description)

    # Test emotion capability
    print(f"\n{banner}")
    print("TESTING EMOTION EXPRESSION CAPABILITY")
    print(banner)
    test_emotion()

    print(f"\n{banner}")
    print("VERIFICATION TEST COMPLETE")
    print(banner)
    print(f"Output directory: {OUTPUT_DIR}")

    # The test functions swallow their own errors, so check the disk instead
    # of assuming every file was produced. This is an existence check only:
    # a file left over from a previous run is listed as well.
    expected_files = [f"{accent_name}_accent_test.wav" for accent_name, _ in accents]
    expected_files += [
        f"{emotion}_emotion_test.wav" for emotion in ("happy", "sad", "angry", "calm")
    ]

    generated_files = []
    missing_files = []
    for file_name in expected_files:
        if os.path.exists(os.path.join(OUTPUT_DIR, file_name)):
            generated_files.append(file_name)
        else:
            missing_files.append(file_name)

    print("\n📋 Generated files:")
    for file_name in generated_files:
        print(f"   - {file_name}")

    if missing_files:
        print("\n⚠️ Missing files (generation failed):")
        for file_name in missing_files:
            print(f"   - {file_name}")

    print("\n🎧 Please listen to the files to verify accent and emotion differences!")