Initial commit for TTS project
This commit is contained in:
119
scripts/generation/generate_accent_demo.py
Normal file
119
scripts/generation/generate_accent_demo.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate accent demos using VoxCPM
|
||||
Supports: Indian, Russian, Singaporean, Hong Kong English accents
|
||||
"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from voxcpm import VoxCPM
|
||||
|
||||
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos"):
    """Generate an accent demo clip with VoxCPM voice cloning.

    Args:
        model: Loaded VoxCPM instance; only its ``generate`` method is used.
        text: English text to synthesize in the requested accent.
        accent_name: One of "indian", "russian", "singaporean", "hongkong".
        output_dir: Directory for the demo wav (created if missing).

    Returns:
        Path of the written demo wav, or None for an unknown accent name.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Reference audio paths (created below if they do not exist yet).
    ref_audio_map = {
        "indian": "reference_indian.wav",
        "russian": "reference_russian.wav",
        "singaporean": "reference_singaporean.wav",
        "hongkong": "reference_hongkong.wav"
    }

    # Reference texts meant to demonstrate each accent's characteristics.
    ref_text_map = {
        "indian": "Hello, how are you doing today? I'm from Mumbai, India. The weather here is quite warm and humid during the summer months. Would you like to try some delicious Indian cuisine with me?",
        "russian": "Hello, how are you doing today? I'm from Moscow, Russia. The winters here are very cold, with lots of snow and ice. But the summers are beautiful and sunny. Would you like to visit the Red Square with me?",
        "singaporean": "Hello, how are you doing today? I'm from Singapore. It's a small but vibrant city-state in Southeast Asia. We have delicious hawker food and beautiful gardens. Would you like to try some chicken rice with me?",
        "hongkong": "Hello, how are you doing today? I'm from Hong Kong. It's a bustling metropolitan city with amazing skyline and delicious food. We have dim sum, roast goose, and many other Cantonese delicacies. Would you like to go shopping in Causeway Bay with me?"
    }

    ref_audio = ref_audio_map.get(accent_name)
    ref_text = ref_text_map.get(accent_name)

    if not ref_audio or not ref_text:
        print(f"Invalid accent name: {accent_name}")
        return

    # NOTE(review): a missing reference is synthesized with the model's
    # DEFAULT voice, so the prompt carries no actual accent — confirm these
    # placeholders are replaced with genuine accented recordings.
    if not os.path.exists(ref_audio):
        print(f"Reference audio not found for {accent_name}, generating with default voice...")
        # Generate reference audio using the default voice.
        audio = model.generate(
            text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20
        )
        # 24 kHz output rate used consistently throughout this script.
        sf.write(ref_audio, audio, 24000)
        print(f"Generated reference audio: {ref_audio}")

    # Generate the accent demo by cloning the reference voice.
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating {accent_name} accent demo...")

    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=2.0,
        inference_timesteps=20
    )

    sf.write(output_file, audio, 24000)
    print(f"Generated {accent_name} accent demo: {output_file}")
    return output_file
|
||||
|
||||
def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos"):
    """Synthesize a Cantonese jyutping demo, creating the voice prompt on demand.

    Note: ``text`` is kept for interface compatibility; only ``pinyin`` is
    actually synthesized. Returns the path of the written demo wav.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Voice prompt for the Cantonese speaker (generated once, then reused).
    prompt_wav = "reference_cantonese.wav"
    prompt_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌和表演。希望你喜欢我的音乐。"

    if not os.path.exists(prompt_wav):
        print("Generating Cantonese reference audio...")
        prompt_audio = model.generate(text=prompt_text, cfg_value=2.0, inference_timesteps=20)
        sf.write(prompt_wav, prompt_audio, 24000)
        print(f"Generated Cantonese reference audio: {prompt_wav}")

    # Synthesize the jyutping input while cloning the reference voice.
    demo_path = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating Cantonese pinyin demo...")

    demo_audio = model.generate(
        text=pinyin,
        prompt_wav_path=prompt_wav,
        prompt_text=prompt_text,
        cfg_value=2.0,
        inference_timesteps=20,
    )

    sf.write(demo_path, demo_audio, 24000)
    print(f"Generated Cantonese pinyin demo: {demo_path}")
    return demo_path
|
||||
|
||||
if __name__ == "__main__":
    # Load the pretrained model from the HuggingFace hub.
    print("Initializing VoxCPM...")
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # English test sentence shared by all accent demos.
    test_text = "Hello everyone, welcome to our podcast. Today we're going to discuss various accents from around the world. I hope you enjoy this episode!"

    # One demo per supported accent.
    accents = ["indian", "russian", "singaporean", "hongkong"]
    for accent in accents:
        generate_accent_demo(model, test_text, accent)

    # Cantonese jyutping demo (Jacky Cheung); syllables + tones in {braces}.
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。他的歌声深情动人,深受歌迷喜爱。"
    cantonese_pinyin = "{zoeng1}{hau2}{juk6} {si6} {hoeng1}{gong2} {zyu4}{ming4} {go1}{sau2},{bei6}{jyu6} {go1}{san4}。{taa1} {dik1} {go1}{sing1} {sam1}{cing4} {dung6}{jan4},{sam1}{sau6} {go1}{mai4} {hei2}{oi3}。"
    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)

    print("All demos generated successfully!")
|
||||
167
scripts/generation/generate_accent_demo_local.py
Normal file
167
scripts/generation/generate_accent_demo_local.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
"""
Accent demo generator using the LOCAL VoxCPM model.

Uses the same generation parameters that worked for the Ben voice clone.
Importing this module has side effects: it extends sys.path, loads the
model, and calls sys.exit() if the import or the model load fails.
"""

import os
import sys
import soundfile as sf
import numpy as np

# Workspace layout.
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")

# Make the local VoxCPM checkout importable.
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print(f"✅ Added VoxCPM path")

# Import VoxCPM from the checkout; abort the process on failure.
try:
    from voxcpm.core import VoxCPM
    print(f"✅ VoxCPM imported successfully")
except Exception as e:
    print(f"❌ Failed to import VoxCPM: {e}")
    sys.exit(1)

# Resolve the LOCAL model directory, trying two naming conventions.
LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
    LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
    print(f"❌ Local model path not found")
    sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")

# Ensure the output directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")

# Initialize VoxCPM with the SAME parameters as the successful Ben clone.
print(f"\n🚀 Initializing VoxCPM with successful parameters...")
try:
    model = VoxCPM(
        voxcpm_model_path=LOCAL_MODEL_PATH,
        enable_denoiser=False,  # disable denoiser for better quality
        optimize=False  # disable optimization to avoid issues
    )
    print(f"✅ VoxCPM initialized successfully")
except Exception as e:
    print(f"❌ VoxCPM initialization failed: {e}")
    sys.exit(1)

# Real reference recordings (the ones that worked for Ben).
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")

print(f"✅ Ben reference audio: {REAL_BEN_REF}")
print(f"✅ Judy reference audio: {REAL_JUDY_REF}")

# Transcripts that MATCH the reference recordings above.
REFERENCE_TEXTS = {
    "ben": "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
    "judy": "题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。"
}
|
||||
|
||||
def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
    """Generate an accent demo using a REAL reference recording.

    NOTE(review): every accent uses Ben's reference audio/text, so
    ``accent_name`` only affects the output file name — all demos come out
    in the same cloned voice. Confirm whether per-accent references were
    intended here.

    Args:
        text: Text to synthesize (first 50 chars are echoed to the log).
        accent_name: Label used to name the output wav.
        output_dir: Destination directory (created at import time).
    """

    # Use Ben's reference as the base voice (it worked well previously).
    ref_audio = REAL_BEN_REF
    ref_text = REFERENCE_TEXTS["ben"]

    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"\n🎙️ Generating {accent_name} accent demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Same generation parameters as the successful Ben voice clone.
        audio = model.generate(
            text=text,
            prompt_wav_path=ref_audio,
            prompt_text=ref_text,
            cfg_value=2.0,  # same as successful Ben
            inference_timesteps=20,  # same as successful Ben
            normalize=True,  # enable text normalization
            denoise=False,  # disable denoise
            retry_badcase=True  # enable retry for bad cases
        )

        # Save at the model's native sample rate.
        sf.write(output_file, audio, model.tts_model.sample_rate)

        # Verify the file actually landed on disk and report stats.
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            duration = len(audio) / model.tts_model.sample_rate
            print(f"✅ Generated successfully!")
            print(f" File: {output_file}")
            print(f" Size: {file_size} bytes")
            print(f" Duration: {duration:.2f} seconds")
        else:
            print(f"❌ Failed to save")

    except Exception as e:
        # Best-effort demo script: log the traceback and keep going.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
    """Synthesize the Cantonese pinyin demo with Ben's cloned voice.

    ``text`` is only echoed for logging; ``pinyin`` is what gets synthesized.
    Errors are logged with a traceback rather than raised.
    """
    demo_path = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print(f"\n🎙️ Generating Cantonese pinyin demo...")
    print(f"Text: {text[:50]}...")

    try:
        # Same generation settings that worked for the Ben voice clone.
        wave = model.generate(
            text=pinyin,
            prompt_wav_path=REAL_BEN_REF,
            prompt_text=REFERENCE_TEXTS["ben"],
            cfg_value=2.0,
            inference_timesteps=20,
            normalize=True,
            denoise=False,
            retry_badcase=True,
        )

        rate = model.tts_model.sample_rate
        sf.write(demo_path, wave, rate)

        # Report what actually landed on disk.
        if not os.path.exists(demo_path):
            print(f"❌ Failed to save")
        else:
            print(f"✅ Generated successfully!")
            print(f" File: {demo_path}")
            print(f" Size: {os.path.getsize(demo_path)} bytes")
            print(f" Duration: {len(wave) / rate:.2f} seconds")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
    # Test sentence (same as before).
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # Generate accent demos using the REAL reference audio.
    accents = ["indian", "russian", "singaporean", "hongkong"]
    for accent in accents:
        generate_accent_demo_with_real_reference(test_text, accent)

    # NOTE(review): despite the variable name, cantonese_pinyin below is
    # English prose, not jyutping — confirm the intended input.
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(cantonese_text, cantonese_pinyin)

    # Final summary banner.
    print(f"\n{'='*70}")
    print(f"ACCENT DEMOS GENERATION COMPLETE")
    print(f"{'='*70}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"\nAll demos generated with the SAME parameters that worked for Ben's voice!")
|
||||
118
scripts/generation/generate_accent_demo_optimized.py
Normal file
118
scripts/generation/generate_accent_demo_optimized.py
Normal file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimized accent demo generator using VoxCPM
|
||||
Improved version with better parameters and shorter text
|
||||
"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from voxcpm import VoxCPM
|
||||
|
||||
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos_optimized"):
    """Generate an accent demo with higher-quality settings (cfg 3.0, 30 steps).

    Args:
        model: Loaded VoxCPM instance; only its ``generate`` method is used.
        text: English text to synthesize in the requested accent.
        accent_name: One of "indian", "russian", "singaporean", "hongkong".
        output_dir: Directory for the demo wav (created if missing).

    Returns:
        Path of the written demo wav, or None for an unknown accent name.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Reference audio paths (distinct from the non-optimized script's files).
    ref_audio_map = {
        "indian": "reference_indian_opt.wav",
        "russian": "reference_russian_opt.wav",
        "singaporean": "reference_singaporean_opt.wav",
        "hongkong": "reference_hongkong_opt.wav"
    }

    # Shorter, more natural reference texts than the first version used.
    ref_text_map = {
        "indian": "Hello there! How are you today? I'm from India. The weather here is quite warm.",
        "russian": "Hello! How are you doing? I'm from Russia. The winters here are very cold.",
        "singaporean": "Hi! How's it going? I'm from Singapore. We have delicious hawker food here.",
        "hongkong": "Hey! How are you? I'm from Hong Kong. It's a bustling city with amazing food."
    }

    ref_audio = ref_audio_map.get(accent_name)
    ref_text = ref_text_map.get(accent_name)

    if not ref_audio or not ref_text:
        print(f"Invalid accent name: {accent_name}")
        return

    # NOTE(review): missing references are synthesized with the default
    # voice, so they carry no actual accent — confirm this is acceptable.
    if not os.path.exists(ref_audio):
        print(f"Generating optimized reference audio for {accent_name}...")
        audio = model.generate(
            text=ref_text,
            cfg_value=3.0,  # higher CFG for better quality
            inference_timesteps=30  # more steps for better quality
        )
        sf.write(ref_audio, audio, 24000)
        print(f"Generated optimized reference audio: {ref_audio}")

    # Generate the accent demo with the optimized parameters.
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating optimized {accent_name} accent demo...")

    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=3.0,  # higher CFG for better adherence to prompt
        inference_timesteps=30  # more steps for better quality
    )

    sf.write(output_file, audio, 24000)
    print(f"Generated optimized {accent_name} accent demo: {output_file}")
    return output_file
|
||||
|
||||
def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos_optimized"):
    """Synthesize the optimized Cantonese demo, building the voice prompt on demand.

    Note: ``text`` is kept for interface compatibility; only ``pinyin`` is
    synthesized. Returns the path of the written demo wav.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Voice prompt for the Cantonese speaker (generated once, then reused).
    prompt_wav = "reference_cantonese_opt.wav"
    prompt_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌。"

    if not os.path.exists(prompt_wav):
        print("Generating optimized Cantonese reference audio...")
        prompt_audio = model.generate(text=prompt_text, cfg_value=3.0, inference_timesteps=30)
        sf.write(prompt_wav, prompt_audio, 24000)
        print(f"Generated optimized Cantonese reference audio: {prompt_wav}")

    # Synthesize the pinyin input while cloning the reference voice.
    demo_path = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating optimized Cantonese pinyin demo...")

    demo_audio = model.generate(
        text=pinyin,
        prompt_wav_path=prompt_wav,
        prompt_text=prompt_text,
        cfg_value=3.0,
        inference_timesteps=30,
    )

    sf.write(demo_path, demo_audio, 24000)
    print(f"Generated optimized Cantonese pinyin demo: {demo_path}")
    return demo_path
|
||||
|
||||
if __name__ == "__main__":
    # Load the pretrained model from the HuggingFace hub.
    print("Initializing VoxCPM...")
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # Shorter test text for better results.
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # One optimized demo per supported accent.
    accents = ["indian", "russian", "singaporean", "hongkong"]
    for accent in accents:
        generate_accent_demo(model, test_text, accent)

    # NOTE(review): despite the variable name, cantonese_pinyin below is
    # English prose, not pinyin/jyutping — confirm the intended input.
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)

    print("All optimized demos generated successfully!")
|
||||
88
scripts/generation/generate_host_b.py
Normal file
88
scripts/generation/generate_host_b.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def generate_host_b():
    """Generate host B's speech with Fish Speech.

    Writes the script to host_b_script.txt, launches the Fish Speech API
    server as a subprocess, waits for it to warm up (aborting early if the
    server process dies), sends one synthesis request through the API
    client, and always shuts the server down afterwards.

    Returns:
        bool: True when the client exits successfully, False otherwise.
    """

    # Host B's lines (based on the earlier podcast content).
    host_b_script = """
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.

Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.

The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.
"""

    # Persist the script so it can be inspected or reused.
    script_file = "host_b_script.txt"
    with open(script_file, "w", encoding="utf-8") as f:
        f.write(host_b_script.strip())

    print("正在使用Fish Speech生成主持人B的语音...")

    # Launch the fish-speech-1.5 API server as a child process.
    print("使用fish-speech-1.5模型...")
    server_cmd = [
        sys.executable, "fish-speech/tools/api_server.py",
        "--llama-checkpoint-path", "fish-speech/checkpoints/fish-speech-1.5/model.pth",
        "--decoder-checkpoint-path", "fish-speech/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
    ]

    server_process = subprocess.Popen(
        server_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        cwd="./"
    )

    try:
        # Warm-up wait while the model loads; unlike a blind 30 s sleep,
        # bail out immediately if the server process has already exited.
        import time
        print("正在启动服务器,加载模型中...")
        for i in range(30):
            time.sleep(1)
            print(f"启动中... {i+1}/30秒")
            if server_process.poll() is not None:
                _, err = server_process.communicate()
                print(f"❌ 生成失败:")
                print(f"错误: {err.decode('utf-8', errors='replace')}")
                return False

        # Send the synthesis request, cloning Ben's reference voice.
        client_cmd = [
            sys.executable, "fish-speech/tools/api_client.py",
            "--text", host_b_script.strip(),
            "--reference_audio", "hosts/ben_guanquelou.wav",
            "--reference_text", "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。",
            "--output", "podcast_audios/host_b_ben",
            "--no-play",
            "--format", "mp3"
        ]

        print("正在发送合成请求...")
        result = subprocess.run(client_cmd, capture_output=True, text=True, cwd="./")
    finally:
        # Always stop the server, even if the client call raised, and reap it.
        server_process.terminate()
        server_process.wait()

    if result.returncode == 0:
        print("✅ 主持人B语音生成完成!")
        print(f"输出文件: podcast_audios/host_b_ben.mp3")
        return True
    else:
        print(f"❌ 生成失败:")
        print(f"错误: {result.stderr}")
        print(f"输出: {result.stdout}")
        return False
|
||||
|
||||
if __name__ == "__main__":
    # Verify both checkpoint files exist before starting the server.
    model_path = "fish-speech/checkpoints/fish-speech-1.5/model.pth"
    decoder_path = "fish-speech/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

    if not os.path.exists(model_path):
        print("❌ 模型文件不存在,请先下载模型")
        print("请运行: bash fish-speech/demo_download.sh")
        sys.exit(1)

    if not os.path.exists(decoder_path):
        print("❌ 解码器文件不存在,请先下载模型")
        print("请运行: bash fish-speech/demo_download.sh")
        sys.exit(1)

    # Return value ignored; exit status does not reflect synthesis failure.
    generate_host_b()
|
||||
142
scripts/generation/generate_moss_ttsd_podcast.py
Executable file
142
scripts/generation/generate_moss_ttsd_podcast.py
Executable file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MOSS-TTSD 播客生成器 - 简化版
|
||||
直接生成到 /root/tts/podcast_audios/
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# 配置
|
||||
OUTPUT_DIR = "/root/tts/podcast_audios"
|
||||
MODEL_DIR = "/root/tts/MOSS-TTSD"
|
||||
|
||||
def generate_podcast(script_file, output_name):
    """
    Generate a podcast and save it directly to podcast_audios.

    Builds a one-record JSONL with the dialogue text plus both speakers'
    voice prompts, runs MOSS-TTSD inference into /tmp, then moves the
    result into OUTPUT_DIR and reports size/duration.

    Args:
        script_file: Path to the dialogue script (.txt with [S1]/[S2] tags).
        output_name: Output file name (without the .wav suffix).

    Returns:
        bool: True on success, False on any failure.
    """

    print(f"🎙️ 生成播客: {output_name}")
    print("=" * 50)

    # The model checkout must exist.
    if not os.path.exists(f"{MODEL_DIR}/MOSS-TTSD-v0.7"):
        print("❌ MOSS-TTSD模型未下载")
        return False

    # The dialogue script must exist.
    if not os.path.exists(script_file):
        print(f"❌ 脚本文件不存在: {script_file}")
        return False

    import json
    import shutil
    import tempfile

    # Read the dialogue script.
    with open(script_file, 'r', encoding='utf-8') as f:
        script_text = f.read().strip()

    # One JSONL record: the text plus both speakers' voice prompts.
    dialogue_data = {
        "id": 1,
        "base_path": "/root/tts/hosts",
        "text": script_text,
        "prompt_audio_speaker1": "ben_guanquelou.wav",
        "prompt_text_speaker1": "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。",
        "prompt_audio_speaker2": "judy_dalingtaohua_trim.wav",
        "prompt_text_speaker2": "大林寺桃花,人间四月芳菲尽,山寺桃花始盛开。"
    }

    # Write the record to a temporary JSONL file.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as f:
        json.dump(dialogue_data, f, ensure_ascii=False)
        f.write('\n')
        temp_jsonl = f.name

    print(f"✅ 脚本加载成功: {len(script_text)} 字符")

    # Run inference into a temporary location.
    print("🎬 正在生成音频...")
    cmd = [
        sys.executable, f"{MODEL_DIR}/inference.py",
        "--jsonl", temp_jsonl,
        "--output_dir", "/tmp",
        "--attn_implementation", "sdpa",
        "--use_normalize",
        "--silence_duration", "0.12",
        "--seed", "42"
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    finally:
        # Remove the temporary JSONL even if inference raised.
        os.unlink(temp_jsonl)

    if result.returncode != 0:
        print("❌ 音频生成失败")
        print(result.stderr)
        return False

    # Check that inference actually produced the audio.
    temp_audio = "/tmp/output_0.wav"
    if not os.path.exists(temp_audio):
        print("❌ 音频文件未生成")
        return False

    # Move into place (shutil instead of shelling out to `cp`).
    output_path = f"{OUTPUT_DIR}/{output_name}.wav"
    shutil.copy2(temp_audio, output_path)
    os.unlink(temp_audio)

    # Probe the duration with ffprobe (best effort; "未知" if unavailable).
    probe_result = subprocess.run(
        ["ffprobe", output_path, "-v", "quiet", "-show_streams"],
        capture_output=True, text=True
    )

    duration = "未知"
    if probe_result.returncode == 0:
        for line in probe_result.stdout.split('\n'):
            if line.startswith("duration="):
                duration = f"{float(line.split('=')[1]):.1f}秒"
                break

    file_size = os.path.getsize(output_path) / (1024 * 1024)

    print(f"✅ 生成成功!")
    print(f"📁 文件位置: {output_path}")
    print(f"📊 文件大小: {file_size:.1f}MB")
    print(f"⏱️ 音频时长: {duration}")
    print()
    print("🎧 播放命令:")
    print(f" ffplay {output_path}")
    print(f" # 或")
    print(f" aplay {output_path}")

    return True
|
||||
|
||||
def main():
    """CLI entry: parse <script file> <output name> and run generate_podcast."""
    # Print usage (in Chinese, matching the rest of the script) unless
    # exactly two positional arguments were given.
    if len(sys.argv) != 3:
        print("用法:")
        print(f" {sys.argv[0]} <脚本文件> <输出名称>")
        print()
        print("示例:")
        print(f" {sys.argv[0]} chapter8_script.txt chapter8_demo")
        print()
        print("脚本文件格式: 纯文本,包含[S1] [S2]标签")
        print("输出名称: 不需要加.wav后缀")
        sys.exit(1)

    script_file = sys.argv[1]
    output_name = sys.argv[2]

    # Result is ignored; the process exit code does not reflect failure.
    generate_podcast(script_file, output_name)
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
297
scripts/generation/generate_multi_guest_podcast.py
Normal file
297
scripts/generation/generate_multi_guest_podcast.py
Normal file
@@ -0,0 +1,297 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pydub import AudioSegment
|
||||
from pydub.generators import WhiteNoise
|
||||
import random
|
||||
|
||||
# 确保输出目录存在
|
||||
output_dir = "podcast_audios"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
def apply_phone_effect(audio_segment, noise_level=0.02, add_dial_tone=False):
    """
    Apply an overseas-telephone effect to a pydub AudioSegment:
    - telephone bandwidth limit (300-3400 Hz)
    - line-noise bed and intermittent crackle bursts
    - light dynamic-range compression
    - optionally prepend a dial tone and connect beep
    Returns a new AudioSegment (non-deterministic: crackle placement uses
    random.random()).
    """
    # 0. Optionally prepend the dialing and connect tones.
    if add_dial_tone:
        # International long-distance dial tone (2 s).
        dial_tone = generate_dial_tone(duration=2000)
        # Short "call connected" beep.
        connect_beep = generate_connect_beep()
        # Prepend both tones to the speech.
        audio_segment = dial_tone + connect_beep + audio_segment

    # 1. Downsample to 8 kHz to mimic telephone quality.
    audio_segment = audio_segment.set_frame_rate(8000)

    # 2. Band-pass the telephone band via low-pass + high-pass
    # (pydub has no direct band-pass filter).
    audio_segment = audio_segment.low_pass_filter(3400)
    audio_segment = audio_segment.high_pass_filter(300)

    # 3. Line-noise bed: low-passed white noise, attenuated.
    # NOTE(review): `60 / noise_level` is a dB attenuation — at the default
    # 0.02 that is 3000 dB, which silences the noise bed entirely, and a
    # LARGER noise_level yields LESS attenuation. Confirm the intended
    # formula (perhaps `60 * noise_level` or a fixed dB offset was meant).
    noise = WhiteNoise().to_audio_segment(duration=len(audio_segment))
    noise = noise.low_pass_filter(2000)  # cap the noise's high end
    noise = noise - (60 / noise_level)  # volume adjustment (see note above)

    # 4. Intermittent crackle (static) bursts.
    crackle_interval = 3000  # one candidate burst every 3 s
    crackle_duration = 200  # each burst lasts 200 ms

    for i in range(0, len(audio_segment), crackle_interval):
        if random.random() < 0.3:  # 30% chance per slot
            # Short low-passed noise burst at a fairly loud level (-30 dB).
            crackle = WhiteNoise().to_audio_segment(duration=crackle_duration)
            crackle = crackle.low_pass_filter(1000)
            crackle = crackle - 30

            # Overlay the burst at this position only if it fits entirely.
            position = i
            if position + crackle_duration < len(audio_segment):
                audio_segment = audio_segment.overlay(crackle, position=position)

    # 5. Mix in the continuous noise bed.
    audio_segment = audio_segment.overlay(noise)

    # 6. Light compression to mimic telephone line-level limiting.
    audio_segment = audio_segment.compress_dynamic_range(threshold=-20.0, ratio=4.0)

    return audio_segment
|
||||
|
||||
def generate_dial_tone(duration=2000):
    """Build an international dial tone: 440 Hz and 350 Hz mixed, at -25 dB."""
    from pydub.generators import Sine
    low = Sine(350).to_audio_segment(duration=duration)
    high = Sine(440).to_audio_segment(duration=duration)
    # Mix the two sines, then drop the level so it sits behind the speech.
    return high.overlay(low) - 25
|
||||
|
||||
def generate_connect_beep(duration=500):
    """Build the call-connected beep: a single 1000 Hz tone at -20 dB."""
    from pydub.generators import Sine
    return Sine(1000).to_audio_segment(duration=duration) - 20
|
||||
|
||||
# Dialogue content (English, based on the paper, with multiple roles).
# Each entry drives one edge-tts invocation: text to speak, Azure neural
# voice name, and the mp3 file name to write under output_dir.
# NOTE(review): several "text" entries embed Chinese phrases that the
# en-US/en-IN voices must read aloud; in particular the "耗尽了" inside
# Alex's reply to Dmitri looks like an untranslated fragment — confirm.
dialogue = [
    # Host 1 (Male, American) - Alex
    {
        "text": "Welcome to Geopolitics Unpacked. I'm Alex.",
        "voice": "en-US-BrianNeural",
        "file": "host1_alex_opening.mp3"
    },
    # Host 2 (Female, American) - Sarah
    {
        "text": "And I'm Sarah. Today we're discussing Ben Xu's paper 'A Tale of 2 Treaties' and exploring the geopolitical dynamics of the Cold War era.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_opening.mp3"
    },
    # Host 1 - Alex
    {
        "text": "Sarah, the paper introduces this fascinating concept of '轮庄博弈' (turn-based power game) to explain historical cycles. How does this apply to the rise and fall of the Warsaw Pact and NATO?",
        "voice": "en-US-BrianNeural",
        "file": "host1_alex_question.mp3"
    },
    # Host 2 - Sarah
    {
        "text": "It's brilliant. The paper argues that just like in a mahjong game, the '庄家' (庄家) tries to maintain power by exploiting the '闲家' (闲家), but eventually gets overthrown by a coalition of the exploited. Applied to the Cold War, this explains how the Soviet Union's attempts to maintain control over its satellite states led to the collapse of the Warsaw Pact.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_response.mp3"
    },
    # Guest 1 (Male, Russian accent) - Dmitri
    {
        "text": "Hello, this is Dmitri calling from Moscow. I found the paper's analysis of the Soviet Union's collapse particularly insightful. The author mentions how the Soviet Union's focus on military power at the expense of technological innovation led to its decline. Do you think this is still relevant today?",
        "voice": "ru-RU-DmitryNeural",
        "file": "guest1_dmitri_callin.mp3"
    },
    # Host 1 - Alex
    {
        "text": "Great question, Dmitri. The paper does highlight how the Soviet Union's decision to abandon the Setun ternary computer in favor of copying IBM's binary systems was a critical mistake. This technological stagnation, combined with the arms race,耗尽了 the Soviet economy. What do you think, Sarah?",
        "voice": "en-US-BrianNeural",
        "file": "host1_alex_response_to_dmitri.mp3"
    },
    # Host 2 - Sarah
    {
        "text": "Absolutely, Dmitri. The paper's analysis of the '赛博共产主义' (cyber communism) vision that never materialized is fascinating. The Soviet Union had the technical expertise to develop advanced computing systems, but bureaucratic interests and a focus on military might derailed those efforts. This is a cautionary tale for any nation that prioritizes military power over technological innovation.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_response_to_dmitri.mp3"
    },
    # Guest 1 - Dmitri
    {
        "text": "Thank you. It's interesting to see how the paper connects these historical lessons to contemporary geopolitics. The rise of China as a technological power while maintaining a strong military presence shows that a balance is possible.",
        "voice": "ru-RU-DmitryNeural",
        "file": "guest1_dmitri_conclusion.mp3"
    },
    # Host 2 - Sarah
    {
        "text": "That's a great point, Dmitri. Thank you for calling in.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_thanks_dmitri.mp3"
    },
    # Guest 2 (Female, Indian accent) - Priya
    {
        "text": "Hi, this is Priya from New Delhi. I was intrigued by the paper's section on '革命输出的会计困局' (the accounting dilemma of revolution export). The author argues that China's foreign aid policies during the Cold War suffered from conflicting objectives. Could you elaborate on this?",
        "voice": "en-IN-NeerjaExpressiveNeural",
        "file": "guest2_priya_callin.mp3"
    },
    # Host 1 - Alex
    {
        "text": "Thanks for calling, Priya. The paper uses an accounting metaphor to explain the problem. Traditional tributary systems had clear objectives (maintaining political order), but revolutionary export tried to achieve both political returns and selfless aid simultaneously, leading to confusion and inefficiency. Sarah, could you expand on this?",
        "voice": "en-US-BrianNeural",
        "file": "host1_alex_response_to_priya.mp3"
    },
    # Host 2 - Sarah
    {
        "text": "Definitely, Priya. The paper argues that this accounting dilemma led to situations where China provided significant aid to countries like Albania and Vietnam without clear strategic returns. When these relationships soured, it created diplomatic challenges. The author suggests that this experience influenced China's more pragmatic foreign aid policies today, which are more focused on mutual benefit through economic cooperation.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_response_to_priya.mp3"
    },
    # Guest 2 - Priya
    {
        "text": "Fascinating. This perspective helps explain the evolution of China's foreign policy from the Cold War era to today's Belt and Road Initiative. Thank you for the insight.",
        "voice": "en-IN-NeerjaExpressiveNeural",
        "file": "guest2_priya_conclusion.mp3"
    },
    # Host 1 - Alex
    {
        "text": "Thank you, Priya. It's been great having both of you on the show today.",
        "voice": "en-US-BrianNeural",
        "file": "host1_alex_final_thanks.mp3"
    },
    # Host 2 - Sarah
    {
        "text": "Join us next time as we continue exploring the insights from Ben Xu's 'A Tale of 2 Treaties' and their relevance to contemporary geopolitics. Until then, this is Geopolitics Unpacked signing off.",
        "voice": "en-US-AriaNeural",
        "file": "host2_sarah_final.mp3"
    }
]
|
||||
|
||||
# Generate each role's audio segment and its matching SRT subtitle file.
print("Generating audio segments and subtitles...")
for item in dialogue:
    file_path = os.path.join(output_dir, item["file"])
    srt_path = os.path.join(output_dir, os.path.splitext(item["file"])[0] + ".srt")
    cmd = [
        "edge-tts",
        "--voice", item["voice"],
        "--text", item["text"],
        "--write-media", file_path,
        "--write-subtitles", srt_path
    ]
    subprocess.run(cmd, check=True)
    print(f"Generated: {item['file']} and {os.path.basename(srt_path)}")

# Concatenate the audio segments in dialogue order.
print("\nConcatenating audio segments...")
combined = AudioSegment.empty()
for item in dialogue:
    file_path = os.path.join(output_dir, item["file"])
    audio = AudioSegment.from_mp3(file_path)

    # Call-in guests (file name contains 'callin') get the phone effect.
    if 'callin' in item["file"].lower():
        print(f" Applying phone effect to: {item['file']}")
        audio = apply_phone_effect(audio, add_dial_tone=True)  # prepend dial tone
        # Keep a copy of the processed version on disk; the subtitle merge
        # later uses its duration for accurate time offsets.
        phone_file_path = os.path.join(output_dir, item["file"].replace('.mp3', '_phone.mp3'))
        audio.export(phone_file_path, format="mp3")

    combined += audio

# Write the complete podcast file.
output_file = os.path.join(output_dir, "multi_guest_callin_podcast.mp3")
combined.export(output_file, format="mp3")
print(f"\nComplete podcast saved to: {output_file}")

# Merge the per-segment SRT subtitle files.
print("\nMerging subtitle files...")
|
||||
|
||||
def parse_srt_time(time_str):
    """Convert an SRT timestamp ("HH:MM:SS,mmm") to milliseconds."""
    hours, minutes, rest = time_str.split(':')
    seconds, millis = rest.split(',')
    total = int(hours)
    total = total * 60 + int(minutes)
    total = total * 60 + int(seconds)
    return total * 1000 + int(millis)
def format_srt_time(ms):
    """Format a millisecond count as an SRT timestamp ("HH:MM:SS,mmm")."""
    seconds, millis = divmod(ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
merged_subtitles = []
current_time = 0  # running time offset for concatenated segments, in milliseconds
subtitle_index = 1

for item in dialogue:
    srt_path = os.path.join(output_dir, os.path.splitext(item["file"])[0] + ".srt")

    # Load this segment's SRT file.
    with open(srt_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Walk the SRT entries: index line, timing line, then text until a blank line.
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.isdigit():
            # Index line found; the timing line comes next.
            i += 1
            time_line = lines[i].strip()
            start_time_str, end_time_str = time_line.split(' --> ')
            start_time = parse_srt_time(start_time_str)
            end_time = parse_srt_time(end_time_str)
            i += 1
            # Collect the (possibly multi-line) subtitle text.
            text_lines = []
            while i < len(lines) and lines[i].strip():
                text_lines.append(lines[i].strip())
                i += 1
            text = '\n'.join(text_lines)
            # Shift timestamps by the accumulated segment offset.
            # NOTE(review): the dial-tone prefix added by apply_phone_effect is
            # not compensated here, so call-in subtitles may lag slightly — confirm.
            adjusted_start = current_time + start_time
            adjusted_end = current_time + end_time
            merged_subtitles.append({
                'index': subtitle_index,
                'start': adjusted_start,
                'end': adjusted_end,
                'text': text,
            })
            subtitle_index += 1
        i += 1

    # Advance the offset by this segment's actual duration.
    file_path = os.path.join(output_dir, item["file"])
    # If a phone-processed version was produced, its duration is authoritative.
    phone_file_path = os.path.join(output_dir, item["file"].replace('.mp3', '_phone.mp3'))
    if os.path.exists(phone_file_path):
        audio = AudioSegment.from_mp3(phone_file_path)
    else:
        audio = AudioSegment.from_mp3(file_path)
    current_time += len(audio)  # len(AudioSegment) is its duration in milliseconds

# Write the merged SRT file.
output_srt = os.path.join(output_dir, "multi_guest_callin_podcast.srt")
with open(output_srt, 'w', encoding='utf-8') as f:
    for sub in merged_subtitles:
        f.write(f"{sub['index']}\n")
        f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
        f.write(f"{sub['text']}\n\n")

print(f"\nComplete subtitle file saved to: {output_srt}")

print("\nPodcast generation completed successfully!")
|
||||
18
scripts/generation/host_b_config.toml
Normal file
18
scripts/generation/host_b_config.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
# F5-TTS configuration for Host B (Ben)
|
||||
model = "F5TTS_v1_Base"
|
||||
|
||||
[reference]
|
||||
audio = "../hosts/ben_guanquelou.wav"
|
||||
text = "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。"
|
||||
|
||||
[generation]
|
||||
text = """
|
||||
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.
|
||||
|
||||
Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.
|
||||
|
||||
The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.
|
||||
"""
|
||||
|
||||
[output]
|
||||
path = "../podcast_audios/host_b_ben_f5.mp3"
|
||||
5
scripts/generation/host_b_script.txt
Normal file
5
scripts/generation/host_b_script.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.
|
||||
|
||||
Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.
|
||||
|
||||
The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.
|
||||
Reference in New Issue
Block a user