Initial commit for TTS project

2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions
--- a/scripts/generate/generate_judy_ben_chapter8.py
+++ b/scripts/generate/generate_judy_ben_chapter8.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""
+Judy and Ben Chapter 8 Introduction Conversation
+Using VoxCPM voice cloning
+"""
+import os
+import sys
+import soundfile as sf
+import numpy as np
+
+# Workspace layout: every location the script touches hangs off WORKSPACE.
+WORKSPACE = "/root/tts"
+_HOSTS_DIR = os.path.join(WORKSPACE, "hosts")
+JUDY_REF = os.path.join(_HOSTS_DIR, "judy_tixilingbi.MP3")
+BEN_REF = os.path.join(_HOSTS_DIR, "ben_guanquelou.wav")
+OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "chapter8_judy_ben")
+VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
+
+# Create the output directory up front (no error if it already exists).
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+print(f"✅ Output directory: {OUTPUT_DIR}")
+
+# Each speaker needs a reference recording; exit with status 1 at the first
+# one that is missing. Judy is checked (and reported) before Ben.
+for _host, _ref_path in (("Judy", JUDY_REF), ("Ben", BEN_REF)):
+    if not os.path.exists(_ref_path):
+        print(f"❌ {_host} reference audio not found: {_ref_path}")
+        sys.exit(1)
+    print(f"✅ {_host} reference audio: {_ref_path}")
+
+# Put the VoxCPM "src" directory at the front of the module search path.
+_voxcpm_src = os.path.join(VOXCPM_DIR, "src")
+sys.path.insert(0, _voxcpm_src)
+print("✅ Added VoxCPM path")
+
+# Import VoxCPM; any exception raised while importing aborts the script.
+try:
+    from voxcpm.core import VoxCPM
+    print("✅ VoxCPM imported successfully")
+except Exception as import_error:
+    print(f"❌ Failed to import VoxCPM: {import_error}")
+    sys.exit(1)
+
+# Resolve the model directory: try each known directory name in order and
+# keep the first one that exists; exit with status 1 when none does.
+for _model_dir_name in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
+    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dir_name)
+    if os.path.exists(LOCAL_MODEL_PATH):
+        break
+else:
+    print("❌ Model path not found")
+    sys.exit(1)
+print(f"✅ Model path: {LOCAL_MODEL_PATH}")
+
+# Build the model from the local directory, with the denoiser and the
+# "optimize" option both switched off.
+print("\n🚀 Initializing VoxCPM...")
+_model_options = {
+    "voxcpm_model_path": LOCAL_MODEL_PATH,
+    "enable_denoiser": False,
+    "optimize": False,
+}
+try:
+    model = VoxCPM(**_model_options)
+    print("✅ VoxCPM initialized successfully")
+except Exception as init_error:
+    print(f"❌ VoxCPM initialization failed: {init_error}")
+    sys.exit(1)
+
+# Text preprocessing function
+def preprocess_text(text):
+    """Spell out known numbers and acronyms so they are pronounced clearly.
+
+    A token is rewritten only where it stands on its own. Plain substring
+    replacement would also rewrite the inside of longer tokens (turning
+    "1400" into "1forty0", or "STRATAGEM" into "STRATAG E M"), so:
+
+    * numeric tokens are skipped when they are part of a longer number,
+      including numbers written with "." or "," separators ("100,000");
+    * acronyms are skipped when an upper-case letter touches them, while a
+      lower-case suffix is still allowed ("CDOs" becomes "C D Os").
+
+    Args:
+        text: Raw line of dialogue.
+
+    Returns:
+        The text with every stand-alone known token replaced by its spoken
+        form. Text containing none of the tokens is returned unchanged.
+    """
+    import re  # local import: the function carries its own dependency
+
+    numeric_tokens = (
+        ("2008", "two thousand and eight"),
+        ("2009", "two thousand and nine"),
+        ("1-3%", "one to three percent"),
+        ("100", "one hundred"),
+        ("40", "forty"),
+    )
+    acronym_tokens = (
+        ("MBS", "M B S"),
+        ("CDO", "C D O"),
+        ("AAA", "triple A"),
+        ("ChiNext", "Chi Next"),
+        ("GEM", "G E M"),
+    )
+
+    for token, spoken in numeric_tokens:
+        # No digit or separator directly before; no digit, and no separator
+        # followed by a digit, directly after. A sentence-ending "." or ","
+        # therefore still allows the replacement.
+        pattern = r"(?<![\d.,])" + re.escape(token) + r"(?!\d|[.,]\d)"
+        text = re.sub(pattern, spoken, text)
+
+    for token, spoken in acronym_tokens:
+        # An adjacent upper-case letter means the token is part of a longer
+        # upper-case word, not the acronym itself.
+        pattern = r"(?<![A-Z])" + re.escape(token) + r"(?![A-Z])"
+        text = re.sub(pattern, spoken, text)
+
+    return text
+
+# Reference texts for voice cloning, keyed by speaker name.
+# NOTE(review): presumably the transcripts of the two reference recordings;
+# confirm against the audio files.
+REFERENCE_TEXTS = dict(
+    judy="题西林壁，横看成岭侧成峰，远近高低各不同。不识庐山真面目，只缘身在此山中。",
+    ben="白日依山尽，黄河入海流。欲穷千里目，更上一层楼。",
+)
+
+# Conversation script, one row per spoken line in playback order:
+# (speaker, output filename, text to synthesize).
+_SCRIPT_ROWS = (
+    (
+        "judy",
+        "judy_start.wav",
+        "Ben, I've been reading Chapter 8 of your book, and I have to say—it's like a movie! The way you connect the financial crisis with tax codes, Gaussian functions, and even a Hong Kong pop star losing money is brilliant. How did you come up with this narrative?",
+    ),
+    (
+        "ben",
+        "ben_tax_explained.wav",
+        "Thanks, Judy. It sounds like a script, right? But it's all true. The key insight is about property taxes. In America, homeowners are essentially tenants of the state because they pay one to three percent tax every year. In China back then, no property tax—you buy it, lock it up, and forget about it. That simple difference saved China from the subprime crisis.",
+    ),
+    (
+        "judy",
+        "judy_ask_about_formula.wav",
+        "Wait, that's fascinating! So American homeowners had to create cash flow from their properties, which led to those complex derivatives. But then you mention David Li and his Gaussian Copula formula. How did that formula trick people like Jacky Cheung?",
+    ),
+    (
+        "ben",
+        "ben_explain_formula.wav",
+        "Ah, the Gaussian Copula! It's a mathematical magic trick. David Li, a Chinese mathematician, created this formula that deleted the correlation between defaults. It told investors, 'Don't worry, if John defaults, Mary won't.' It turned junk loans into triple A rated securities. That's how Jacky Cheung got trapped—he bought Lehman Minibonds rated triple A because of this formula, and lost around forty million Hong Kong dollars!",
+    ),
+    (
+        "judy",
+        "judy_ask_about_chinext.wav",
+        "Forty million? That's incredible! And then the twist—China launching ChiNext during the financial crisis. That seems counterintuitive. Why did they do that?",
+    ),
+    (
+        "ben",
+        "ben_explain_chinext.wav",
+        "Exactly! While Wall Street was melting down and Jacky was crying over his losses, Beijing looked at the rubble and realized: 'Making shirts and toys is dead. We need our own Google, our own Apple.' So in two thousand and nine, right in the middle of the financial tsunami, they launched ChiNext. It was a desperate pivot from being the World's Factory to becoming a Tech Powerhouse. That crisis forced China to change lanes.",
+    ),
+    (
+        "judy",
+        "judy_conclude.wav",
+        "Wow, that's such a powerful narrative. The contrast between the American financial system melting down because of complexity, and China pivoting to innovation is really striking. Let's dive deeper into Chapter 8 and explore how this all played out.",
+    ),
+)
+
+# Expand each row into the dict shape the rest of the script reads
+# ("speaker", "text", "filename", in that key order).
+CONVERSATION = [
+    {"speaker": who, "text": words, "filename": wav_name}
+    for who, wav_name, words in _SCRIPT_ROWS
+]
+
+# Generate cloned voices
+import traceback
+
+print(f"\n{'='*70}")
+print("GENERATING JUDY & BEN CONVERSATION")
+print(f"{'='*70}")
+
+# The `model` instance created during initialization earlier in this script
+# is reused here. Constructing VoxCPM a second time would load the model
+# again, and would do so outside any error handling.
+
+# Reference recording for each speaker. An explicit lookup makes a
+# misspelled speaker fail that line instead of silently using Ben's voice.
+REFERENCE_AUDIO = {"judy": JUDY_REF, "ben": BEN_REF}
+
+# Size in bytes of every file written during THIS run, keyed by filename.
+# The summary reads from here, so a stale file left by an earlier run is
+# never reported as a success.
+generated_sizes = {}
+
+for line in CONVERSATION:
+    speaker = line["speaker"]
+    text = line["text"]
+    filename = line["filename"]
+
+    print(f"\n🎙️ Generating {speaker}'s line: {filename}")
+    print(f"Text: {text[:50]}...")
+
+    # Preprocess text
+    processed_text = preprocess_text(text)
+
+    # Best effort: a failure on one line is reported and the loop moves on.
+    try:
+        if speaker not in REFERENCE_AUDIO or speaker not in REFERENCE_TEXTS:
+            raise ValueError(f"Unknown speaker: {speaker!r}")
+
+        # Generate audio, cloning the voice from the speaker's reference
+        # recording and its matching reference text.
+        audio = model.generate(
+            text=processed_text,
+            prompt_wav_path=REFERENCE_AUDIO[speaker],
+            prompt_text=REFERENCE_TEXTS[speaker],
+            cfg_value=2.0,
+            inference_timesteps=20,
+            normalize=True,
+            denoise=False,
+            retry_badcase=True
+        )
+
+        # Save audio at the model's own sample rate
+        sample_rate = model.tts_model.sample_rate
+        output_file = os.path.join(OUTPUT_DIR, filename)
+        sf.write(output_file, audio, sample_rate)
+
+        # Verify the file landed on disk before counting it as generated
+        if os.path.exists(output_file):
+            file_size = os.path.getsize(output_file)
+            duration = len(audio) / sample_rate
+            generated_sizes[filename] = file_size
+            print("✅ Generated successfully!")
+            print(f"   File: {output_file}")
+            print(f"   Size: {file_size} bytes")
+            print(f"   Duration: {duration:.2f} seconds")
+        else:
+            print("❌ Failed to save")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        traceback.print_exc()
+
+# Summary
+print(f"\n{'='*70}")
+print("CONVERSATION GENERATION COMPLETE")
+print(f"{'='*70}")
+print(f"Output directory: {OUTPUT_DIR}")
+print("\nGenerated files:")
+for line in CONVERSATION:
+    name = line["filename"]
+    if name in generated_sizes:
+        print(f"   - {name} ({generated_sizes[name]} bytes)")
+    else:
+        print(f"   - {name} (FAILED)")
+print(f"\n{'='*70}")