# tts/scripts/generate/generate_author_interview.py

#!/usr/bin/env python3
"""
Author Interview Podcast Generator - Chapter 8
- Author uses VoxCPM for voice
- Other guests use Edge TTS
- All content in English
"""
import os
import sys
import subprocess
import time
from datetime import datetime

# Filesystem layout: every location is derived from one fixed workspace root.
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "chapter8_author_interview")

# The destination folder must exist before any audio file is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory created: {OUTPUT_DIR}")

# VoxCPM is imported from the "src" directory under VOXCPM_DIR.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print("✅ Added VoxCPM path")

# VoxCPM supplies the author's voice; the script stops if it cannot be imported.
try:
    from voxcpm.core import VoxCPM
    print("✅ VoxCPM imported successfully")
except Exception as import_error:
    print(f"❌ Failed to import VoxCPM: {import_error}")
    sys.exit(1)

# Locate the model directory: try each known folder name in order and keep
# the first one that exists; give up when none is present.
for _model_dir_name in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dir_name)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    print("❌ Model path not found")
    sys.exit(1)
print(f"✅ Model path: {LOCAL_MODEL_PATH}")

# Build the single VoxCPM instance that generate_author_voice() uses.
print("\n🚀 Initializing VoxCPM for author voice...")
try:
    author_voice = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH, enable_denoiser=False, optimize=False
    )
    print("✅ VoxCPM initialized successfully")
except Exception as init_error:
    print(f"❌ VoxCPM initialization failed: {init_error}")
    sys.exit(1)

# Guest id -> Edge TTS voice name. The ids must match the keys under
# INTERVIEW_CONTENT["guests"].
# NOTE(review): the dmitri and mohammed voices are ru-RU / ar-SA locale voices
# while the interview text is English - confirm the rendering is acceptable.
EDGE_TTS_VOICES = dict(
    graham="en-US-GuyNeural",      # American male for tech bro
    dmitri="ru-RU-DmitryNeural",   # Russian male for Dmitri
    amita="en-US-AriaNeural",      # American female as fallback for Amita
    mohammed="ar-SA-HamedNeural",  # Arabic male for Mohammed
)

# Interview script, grouped by speaker.
#
# Layout:
#   INTERVIEW_CONTENT["author"][<segment>]          -> {"text", "filename"}
#   INTERVIEW_CONTENT["guests"][<guest>][<segment>] -> {"text", "filename"}
#
# "text" is the line to synthesize and "filename" is the file name written
# inside OUTPUT_DIR. Guest keys are looked up in EDGE_TTS_VOICES, so every
# guest listed here needs an entry there.
#
# NOTE(review): the author intro text contains the Chinese characters 胯下 even
# though the module docstring states that all content is English - confirm the
# TTS model renders them as intended or replace them with an English gloss.
INTERVIEW_CONTENT = {
    # Segments spoken by the author (rendered with VoxCPM).
    "author": {
        "intro": {
            "text": "Welcome to the chapter 8 interview. Today we're discussing how China used patience to get its entry ticket to the world factory between 2001 and 2009. The core metaphor is Han Xin's胯下 humiliation - enduring temporary shame for long-term success.",
            "filename": "author_intro.wav"
        },
        "response_1": {
            "text": "Great question, Graham. The technical gap was indeed significant. But China understood that modern warfare is about endurance, not just firepower. While America was fighting the War on Terror, China was building its industrial base. This strategic patience is what allowed them to become the world's factory.",
            "filename": "author_response_1.wav"
        },
        "response_2": {
            "text": "Dmitri makes an excellent point about energy. Russia's natural gas was crucial for China's 24-hour production lines. This was a mutually beneficial strategic cooperation - Russia provided the energy, China provided the market. It's a perfect example of how geopolitical interests can create unexpected alliances.",
            "filename": "author_response_2.wav"
        }
    },
    # Segments spoken by the guests (rendered with Edge TTS), one entry per guest id.
    "guests": {
        "graham": {
            "question": {
                "text": "Wait, host. I think you're missing a key variable - the technological gap. In the 2003 Iraq War, the US overthrew Saddam in just 42 days. In 2001 Afghanistan, precision-guided bombs destroyed all Taliban strongholds. This shows war has changed. Why are you still using Cold War thinking to analyze geopolitics?",
                "filename": "graham_question.wav"
            }
        },
        "dmitri": {
            "question": {
                "text": "Host, I agree technology is important, but let me add - energy is the ultimate ace. In 2006, when natural gas prices rose, how did Europeans tremble? China became the world's factory precisely because of Russia's energy support. Siberian gas pipelines are the real entry ticket. Without Russian energy, how could China operate 24/7?",
                "filename": "dmitri_question.wav"
            }
        },
        "amita": {
            "question": {
                "text": "Wait, both of you. The world factory you're talking about seems to assume the 'China model' is the only one. But let me remind you - after 2008, Bangalore is rising. India's software outsourcing, Mexico's nearshoring, Vietnam's assembly lines... There's more than one world factory. Why do you only talk about China?",
                "filename": "amita_question.wav"
            }
        },
        "mohammed": {
            "question": {
                "text": "You all make good points, but I want to ask a more fundamental question - is the concept of 'world factory' itself a trap? What did China get for its 70% foreign trade dependence? It got US aircraft carriers that can cut off the Malacca Strait at any time. It got the risk of putting all eggs in one basket. Host, you call this an 'entry ticket'? I think it's more like an invitation to a trap.",
                "filename": "mohammed_question.wav"
            }
        }
    }
}

# Author segments are synthesized with the module-level VoxCPM instance.
def generate_author_voice(text, filename):
    """Synthesize one author segment with VoxCPM and save it in OUTPUT_DIR.

    Args:
        text: The line to speak.
        filename: File name (inside OUTPUT_DIR) for the rendered audio.

    Returns:
        True when the audio file exists after writing; False when it does not
        or when any step raised (the traceback is printed, not propagated).
    """
    target_path = os.path.join(OUTPUT_DIR, filename)
    print(f"\n🎙️ Generating author voice for: {filename}")
    print(f"Text: {text[:50]}...")

    # Synthesis settings, kept together so they are easy to scan.
    synthesis_options = dict(
        text=text,
        prompt_wav_path=None,
        prompt_text=None,
        cfg_value=2.0,
        inference_timesteps=20,
        normalize=True,
        denoise=False,
        retry_badcase=True,
    )

    try:
        waveform = author_voice.generate(**synthesis_options)

        # soundfile is only needed here, so it is imported on demand.
        import soundfile as sf
        sample_rate = author_voice.tts_model.sample_rate
        sf.write(target_path, waveform, sample_rate)

        if not os.path.exists(target_path):
            print("❌ Failed to save author voice")
            return False

        size_in_bytes = os.path.getsize(target_path)
        seconds = len(waveform) / sample_rate
        print("✅ Author voice generated successfully!")
        print(f"   File: {target_path}")
        print(f"   Size: {size_in_bytes} bytes")
        print(f"   Duration: {seconds:.2f} seconds")
        return True

    except Exception as error:
        print(f"❌ Error generating author voice: {error}")
        import traceback
        traceback.print_exc()
        return False

# Function to generate guest voice with Edge TTS
def generate_guest_voice(guest_id, text, filename):
    """Generate one guest segment with the edge-tts CLI and save it in OUTPUT_DIR.

    edge-tts writes MP3 data regardless of the extension of the target file.
    When ``filename`` ends in ``.wav`` the MP3 is therefore written to a
    sidecar file first and transcoded to a real WAV with ``soundfile``. If the
    transcoding step is unavailable (for example a libsndfile build without
    MP3 support), the MP3 data is stored under ``filename`` unchanged.

    Args:
        guest_id: Key into EDGE_TTS_VOICES selecting the voice.
        text: The line to speak.
        filename: File name (inside OUTPUT_DIR) for the rendered audio.

    Returns:
        True when a non-empty audio file was produced; False when the guest
        has no configured voice, edge-tts failed, or any step raised (the
        traceback is printed, not propagated).
    """
    output_file = os.path.join(OUTPUT_DIR, filename)
    voice = EDGE_TTS_VOICES.get(guest_id)

    if not voice:
        print(f"❌ No voice found for guest: {guest_id}")
        return False

    print(f"\n🎙️ Generating {guest_id} voice with Edge TTS: {filename}")
    print(f"Voice: {voice}")
    print(f"Text: {text[:50]}...")

    # Only ".wav" targets need the detour through a sidecar MP3 file.
    stem, extension = os.path.splitext(output_file)
    wants_wav = extension.lower() == ".wav"
    media_file = f"{stem}.edge-tts.mp3" if wants_wav else output_file

    try:
        # A sidecar left behind by an interrupted run must not be mistaken
        # for the output of this invocation.
        if wants_wav and os.path.exists(media_file):
            os.remove(media_file)

        # Use edge-tts command (argument list, no shell involved)
        command = [
            "edge-tts",
            "--voice", voice,
            "--text", text,
            "--write-media", media_file
        ]

        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            cwd=WORKSPACE
        )

        # A zero exit status alone is not enough: require real audio bytes.
        produced = (
            result.returncode == 0
            and os.path.exists(media_file)
            and os.path.getsize(media_file) > 0
        )
        if not produced:
            print("❌ Failed to generate guest voice")
            print(f"   Error: {result.stderr}")
            return False

        if wants_wav:
            try:
                import soundfile as sf
                samples, sample_rate = sf.read(media_file)
                sf.write(output_file, samples, sample_rate)
            except Exception as convert_error:
                # Best effort: keep the audio rather than lose it.
                print(f"⚠️ Could not convert MP3 to WAV ({convert_error}); "
                      f"storing MP3 data in {filename}")
                os.replace(media_file, output_file)
            else:
                os.remove(media_file)

        file_size = os.path.getsize(output_file)
        print("✅ Guest voice generated successfully!")
        print(f"   File: {output_file}")
        print(f"   Size: {file_size} bytes")
        return True

    except Exception as e:
        print(f"❌ Error generating guest voice: {e}")
        import traceback
        traceback.print_exc()
        return False

# Main generation process: render every segment, list the resulting files and
# finish with a status that reflects whether all segments succeeded.
print(f"\n{'='*70}")
print("STARTING AUTHOR INTERVIEW PODCAST GENERATION")
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*70}")

# File names of segments whose generator returned False.
failed_files = []

# Generate author voice (using VoxCPM)
print(f"\n{'='*50}")
print("GENERATING AUTHOR VOICE (VoxCPM)")
print(f"{'='*50}")

for content in INTERVIEW_CONTENT["author"].values():
    if not generate_author_voice(content["text"], content["filename"]):
        failed_files.append(content["filename"])

# Generate guest voices (using Edge TTS)
print(f"\n{'='*50}")
print("GENERATING GUEST VOICES (Edge TTS)")
print(f"{'='*50}")

for guest_id, guest_content in INTERVIEW_CONTENT["guests"].items():
    for content in guest_content.values():
        if not generate_guest_voice(guest_id, content["text"], content["filename"]):
            failed_files.append(content["filename"])

# Verify all files
print(f"\n{'='*70}")
print("VERIFICATION: GENERATED FILES")
print(f"{'='*70}")

# Every .wav below OUTPUT_DIR is listed; this can include files left over
# from earlier runs, so failures are tracked separately via failed_files.
all_files = []
for root, dirs, files in os.walk(OUTPUT_DIR):
    for file in files:
        if file.endswith('.wav'):
            file_path = os.path.join(root, file)
            file_size = os.path.getsize(file_path)
            all_files.append((file, file_size))

if all_files:
    print(f"✅ Generated {len(all_files)} files:")
    for file, size in all_files:
        print(f"   📄 {file} ({size} bytes)")
else:
    print("❌ No files generated!")

if failed_files:
    print(f"❌ {len(failed_files)} segment(s) failed to generate:")
    for failed_name in failed_files:
        print(f"   ⚠️ {failed_name}")

print(f"\n{'='*70}")
if failed_files:
    print("PODCAST GENERATION FINISHED WITH ERRORS")
else:
    print("PODCAST GENERATION COMPLETE")
print(f"Output directory: {OUTPUT_DIR}")
print(f"{'='*70}")

# Report failure to the caller (shell, CI, scheduler) instead of exiting 0.
if failed_files:
    sys.exit(1)