Initial commit for TTS project

This commit is contained in:
Ben
2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions

View File

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""
Judy and Ben Chapter 8 Introduction Conversation
Using VoxCPM voice cloning
"""
import os
import re
import sys
import traceback

import numpy as np
import soundfile as sf
# Filesystem layout: everything lives under one workspace root.
WORKSPACE = "/root/tts"
_HOSTS_DIR = os.path.join(WORKSPACE, "hosts")
JUDY_REF = os.path.join(_HOSTS_DIR, "judy_tixilingbi.MP3")
BEN_REF = os.path.join(_HOSTS_DIR, "ben_guanquelou.wav")
OUTPUT_DIR = os.path.join(WORKSPACE, "podcast_audios", "chapter8_judy_ben")
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")

# Create the output folder before anything tries to write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Both reference recordings are required; exit with status 1 if one is absent.
if os.path.exists(JUDY_REF):
    print(f"✅ Judy reference audio: {JUDY_REF}")
else:
    print(f"❌ Judy reference audio not found: {JUDY_REF}")
    sys.exit(1)

if os.path.exists(BEN_REF):
    print(f"✅ Ben reference audio: {BEN_REF}")
else:
    print(f"❌ Ben reference audio not found: {BEN_REF}")
    sys.exit(1)

# The VoxCPM checkout is not installed as a package, so its `src` folder is
# put at the front of the import path before importing from it.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print(f"✅ Added VoxCPM path")

try:
    from voxcpm.core import VoxCPM
    print(f"✅ VoxCPM imported successfully")
except Exception as exc:
    print(f"❌ Failed to import VoxCPM: {exc}")
    sys.exit(1)

# Locate the model weights: try the preferred folder name first, then the
# alternative one; the `else` branch runs only when neither exists.
for _model_dirname in ("openbmb__VoxCPM1.5", "VoxCPM1.5"):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", _model_dirname)
    if os.path.exists(LOCAL_MODEL_PATH):
        break
else:
    print(f"❌ Model path not found")
    sys.exit(1)
print(f"✅ Model path: {LOCAL_MODEL_PATH}")

# Build the model once; any construction failure ends the script with status 1.
print(f"\n🚀 Initializing VoxCPM...")
_model_options = dict(
    voxcpm_model_path=LOCAL_MODEL_PATH,
    enable_denoiser=False,
    optimize=False,
)
try:
    model = VoxCPM(**_model_options)
    print(f"✅ VoxCPM initialized successfully")
except Exception as exc:
    print(f"❌ VoxCPM initialization failed: {exc}")
    sys.exit(1)
# Text preprocessing function

# Numeric tokens and the words they are read as, applied in this order.
_SPOKEN_NUMBERS = (
    ("2008", "two thousand and eight"),
    ("2009", "two thousand and nine"),
    ("1-3%", "one to three percent"),
    ("100", "one hundred"),
    ("40", "forty"),
)

# A numeric token is rewritten only when it is a complete number: it must not
# touch another digit, nor be joined to one by "." or "," (as in "1,100" or
# "100.5"). Plain substring replacement would turn "1000" into "one hundred0".
_NUMBER_RULES = tuple(
    (
        re.compile(rf"(?<!\d)(?<!\d[.,]){re.escape(token)}(?!\d)(?![.,]\d)"),
        spoken,
    )
    for token, spoken in _SPOKEN_NUMBERS
)

# Acronyms and names that are spelled out; these stay substring-based and
# case-sensitive, so e.g. "CDOs" still becomes "C D Os".
_SPOKEN_TERMS = (
    ("MBS", "M B S"),
    ("CDO", "C D O"),
    ("AAA", "triple A"),
    ("ChiNext", "Chi Next"),
    ("GEM", "G E M"),
)


def preprocess_text(text: str) -> str:
    """Rewrite selected numbers and acronyms into a speakable form.

    Args:
        text: The line of dialogue to prepare for synthesis.

    Returns:
        A copy of ``text`` in which the known numeric tokens (2008, 2009,
        1-3%, 100, 40) are replaced by words when they appear as complete
        numbers, and the known terms (MBS, CDO, AAA, ChiNext, GEM) are
        spelled out wherever they occur. All other text is left untouched.
    """
    for pattern, spoken in _NUMBER_RULES:
        text = pattern.sub(spoken, text)
    for term, spoken in _SPOKEN_TERMS:
        text = text.replace(term, spoken)
    return text
# Prompt text per speaker, keyed by the speaker id used in CONVERSATION.
# NOTE(review): presumably each entry is the transcript of that speaker's
# reference recording - confirm against the audio files.
REFERENCE_TEXTS = dict(
    judy="题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。",
    ben="白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
)
# Dialogue script as (speaker, output filename, spoken text) rows, in playback order.
_SCRIPT_ROWS = (
    (
        "judy",
        "judy_start.wav",
        "Ben, I've been reading Chapter 8 of your book, and I have to say—it's like a movie! The way you connect the financial crisis with tax codes, Gaussian functions, and even a Hong Kong pop star losing money is brilliant. How did you come up with this narrative?",
    ),
    (
        "ben",
        "ben_tax_explained.wav",
        "Thanks, Judy. It sounds like a script, right? But it's all true. The key insight is about property taxes. In America, homeowners are essentially tenants of the state because they pay one to three percent tax every year. In China back then, no property tax—you buy it, lock it up, and forget about it. That simple difference saved China from the subprime crisis.",
    ),
    (
        "judy",
        "judy_ask_about_formula.wav",
        "Wait, that's fascinating! So American homeowners had to create cash flow from their properties, which led to those complex derivatives. But then you mention David Li and his Gaussian Copula formula. How did that formula trick people like Jacky Cheung?",
    ),
    (
        "ben",
        "ben_explain_formula.wav",
        "Ah, the Gaussian Copula! It's a mathematical magic trick. David Li, a Chinese mathematician, created this formula that deleted the correlation between defaults. It told investors, 'Don't worry, if John defaults, Mary won't.' It turned junk loans into triple A rated securities. That's how Jacky Cheung got trapped—he bought Lehman Minibonds rated triple A because of this formula, and lost around forty million Hong Kong dollars!",
    ),
    (
        "judy",
        "judy_ask_about_chinext.wav",
        "Forty million? That's incredible! And then the twist—China launching ChiNext during the financial crisis. That seems counterintuitive. Why did they do that?",
    ),
    (
        "ben",
        "ben_explain_chinext.wav",
        "Exactly! While Wall Street was melting down and Jacky was crying over his losses, Beijing looked at the rubble and realized: 'Making shirts and toys is dead. We need our own Google, our own Apple.' So in two thousand and nine, right in the middle of the financial tsunami, they launched ChiNext. It was a desperate pivot from being the World's Factory to becoming a Tech Powerhouse. That crisis forced China to change lanes.",
    ),
    (
        "judy",
        "judy_conclude.wav",
        "Wow, that's such a powerful narrative. The contrast between the American financial system melting down because of complexity, and China pivoting to innovation is really striking. Let's dive deeper into Chapter 8 and explore how this all played out.",
    ),
)

# One dict per line of dialogue, with keys in the order speaker, text, filename.
CONVERSATION = [
    {"speaker": who, "text": words, "filename": wav_name}
    for who, wav_name, words in _SCRIPT_ROWS
]
# Generate cloned voices
print(f"\n{'='*70}")
print("GENERATING JUDY & BEN CONVERSATION")
print(f"{'='*70}")

# The `model` built during start-up is reused here. Constructing VoxCPM a
# second time would repeat the whole model load, and that second construction
# had no error handling around it.

# Reference recording for each known speaker. A speaker id missing from this
# table is reported as a failure rather than silently rendered in Ben's voice.
_REFERENCE_AUDIO = {
    "judy": JUDY_REF,
    "ben": BEN_REF,
}

# filename -> size in bytes, for files written successfully during THIS run.
# The summary reads this instead of probing the disk, so a stale file left by
# an earlier run cannot hide a failure.
_generated_sizes = {}

for line in CONVERSATION:
    speaker = line["speaker"]
    text = line["text"]
    filename = line["filename"]
    print(f"\n🎙️ Generating {speaker}'s line: {filename}")
    print(f"Text: {text[:50]}...")

    if speaker not in _REFERENCE_AUDIO or speaker not in REFERENCE_TEXTS:
        print(f"❌ Error: unknown speaker {speaker!r}")
        continue
    ref_audio = _REFERENCE_AUDIO[speaker]
    ref_text = REFERENCE_TEXTS[speaker]

    # Preprocess text
    processed_text = preprocess_text(text)

    try:
        # Generate audio
        audio = model.generate(
            text=processed_text,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )
        # Save audio
        output_file = os.path.join(OUTPUT_DIR, filename)
        sample_rate = model.tts_model.sample_rate
        sf.write(output_file, audio, sample_rate)
        # Verify
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            duration = len(audio) / sample_rate
            _generated_sizes[filename] = file_size
            print("✅ Generated successfully!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
        else:
            print("❌ Failed to save")
    except Exception as e:
        # Best effort: report the failure and carry on with the next line.
        print(f"❌ Error: {e}")
        traceback.print_exc()

# Summary
print(f"\n{'='*70}")
print("CONVERSATION GENERATION COMPLETE")
print(f"{'='*70}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for line in CONVERSATION:
    name = line["filename"]
    if name in _generated_sizes:
        print(f" - {name} ({_generated_sizes[name]} bytes)")
    else:
        print(f" - {name} (FAILED)")
print(f"\n{'='*70}")