Initial commit for TTS project

Ben committed 2026-01-19 10:27:41 +08:00
commit a9abd3913d
160 changed files with 11031 additions and 0 deletions

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Generate accent demos using VoxCPM
Supports: Indian, Russian, Singaporean, Hong Kong English accents
"""
import os
import numpy as np
import soundfile as sf
from voxcpm import VoxCPM
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos"):
    """Generate accent demo audio"""
    os.makedirs(output_dir, exist_ok=True)

    # Define reference audio paths (created automatically if they don't exist)
    ref_audio_map = {
        "indian": "reference_indian.wav",
        "russian": "reference_russian.wav",
        "singaporean": "reference_singaporean.wav",
        "hongkong": "reference_hongkong.wav"
    }

    # Define reference texts that demonstrate accent characteristics
    ref_text_map = {
        "indian": "Hello, how are you doing today? I'm from Mumbai, India. The weather here is quite warm and humid during the summer months. Would you like to try some delicious Indian cuisine with me?",
        "russian": "Hello, how are you doing today? I'm from Moscow, Russia. The winters here are very cold, with lots of snow and ice. But the summers are beautiful and sunny. Would you like to visit the Red Square with me?",
        "singaporean": "Hello, how are you doing today? I'm from Singapore. It's a small but vibrant city-state in Southeast Asia. We have delicious hawker food and beautiful gardens. Would you like to try some chicken rice with me?",
        "hongkong": "Hello, how are you doing today? I'm from Hong Kong. It's a bustling metropolitan city with amazing skyline and delicious food. We have dim sum, roast goose, and many other Cantonese delicacies. Would you like to go shopping in Causeway Bay with me?"
    }

    ref_audio = ref_audio_map.get(accent_name)
    ref_text = ref_text_map.get(accent_name)
    if not ref_audio or not ref_text:
        print(f"Invalid accent name: {accent_name}")
        return

    # Check if reference audio exists (if not, we'll generate it using the default voice)
    if not os.path.exists(ref_audio):
        print(f"Reference audio not found for {accent_name}, generating with default voice...")
        # Generate reference audio using default voice
        audio = model.generate(
            text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20
        )
        sf.write(ref_audio, audio, 24000)
        print(f"Generated reference audio: {ref_audio}")

    # Generate accent demo
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating {accent_name} accent demo...")
    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=2.0,
        inference_timesteps=20
    )
    sf.write(output_file, audio, 24000)
    print(f"Generated {accent_name} accent demo: {output_file}")
    return output_file


def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos"):
    """Generate Cantonese pinyin demo"""
    os.makedirs(output_dir, exist_ok=True)

    # Generate reference audio for the Cantonese accent
    ref_audio = "reference_cantonese.wav"
    ref_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌和表演。希望你喜欢我的音乐。"
    if not os.path.exists(ref_audio):
        print("Generating Cantonese reference audio...")
        audio = model.generate(
            text=ref_text,
            cfg_value=2.0,
            inference_timesteps=20
        )
        sf.write(ref_audio, audio, 24000)
        print(f"Generated Cantonese reference audio: {ref_audio}")

    # Generate Cantonese pinyin demo
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating Cantonese pinyin demo...")
    audio = model.generate(
        text=pinyin,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=2.0,
        inference_timesteps=20
    )
    sf.write(output_file, audio, 24000)
    print(f"Generated Cantonese pinyin demo: {output_file}")
    return output_file


if __name__ == "__main__":
    # Initialize VoxCPM
    print("Initializing VoxCPM...")
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # Test sentence
    test_text = "Hello everyone, welcome to our podcast. Today we're going to discuss various accents from around the world. I hope you enjoy this episode!"

    # Generate accent demos
    accents = ["indian", "russian", "singaporean", "hongkong"]
    for accent in accents:
        generate_accent_demo(model, test_text, accent)

    # Generate Cantonese pinyin demo (Jacky Cheung)
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。他的歌声深情动人,深受歌迷喜爱。"
    cantonese_pinyin = "{zoeng1}{hau2}{juk6} {si6} {hoeng1}{gong2} {zyu4}{ming4} {go1}{sau2}{bei6}{jyu6} {go1}{san4}{taa1} {dik1} {go1}{sing1} {sam1}{cing4} {dung6}{jan4}{sam1}{sau6} {go1}{mai4} {hei2}{oi3}"
    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)

    print("All demos generated successfully!")

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Accent demo generator using LOCAL VoxCPM model
Using the same successful parameters as the Ben voice cloning
"""
import os
import sys
import soundfile as sf
import numpy as np
# Paths
WORKSPACE = "/root/tts"
VOXCPM_DIR = os.path.join(WORKSPACE, "VoxCPM")
OUTPUT_DIR = os.path.join(WORKSPACE, "accent_demos_local")
# Add VoxCPM to path
sys.path.insert(0, os.path.join(VOXCPM_DIR, "src"))
print(f"✅ Added VoxCPM path")
# Import VoxCPM
try:
from voxcpm.core import VoxCPM
print(f"✅ VoxCPM imported successfully")
except Exception as e:
print(f"❌ Failed to import VoxCPM: {e}")
sys.exit(1)
# Use LOCAL model (same as successful Ben voice cloning)
LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "openbmb__VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
LOCAL_MODEL_PATH = os.path.join(VOXCPM_DIR, "models", "VoxCPM1.5")
if not os.path.exists(LOCAL_MODEL_PATH):
print(f"❌ Local model path not found")
sys.exit(1)
print(f"✅ Using local model: {LOCAL_MODEL_PATH}")
# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Output directory: {OUTPUT_DIR}")
# Initialize VoxCPM with the SAME parameters as successful Ben voice cloning
print(f"\n🚀 Initializing VoxCPM with successful parameters...")
try:
model = VoxCPM(
voxcpm_model_path=LOCAL_MODEL_PATH,
enable_denoiser=False, # Disable denoiser for better quality
optimize=False # Disable optimization to avoid issues
)
print(f"✅ VoxCPM initialized successfully")
except Exception as e:
print(f"❌ VoxCPM initialization failed: {e}")
sys.exit(1)
# Use REAL reference audio files (the ones that worked for Ben)
REAL_BEN_REF = os.path.join(WORKSPACE, "hosts", "ben_guanquelou.wav")
REAL_JUDY_REF = os.path.join(WORKSPACE, "hosts", "judy_tixilingbi.MP3")
print(f"✅ Ben reference audio: {REAL_BEN_REF}")
print(f"✅ Judy reference audio: {REAL_JUDY_REF}")
# Reference texts that MATCH the audio
REFERENCE_TEXTS = {
"ben": "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
"judy": "题西林壁,横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。"
}
def generate_accent_demo_with_real_reference(text, accent_name, output_dir=OUTPUT_DIR):
"""Generate accent demo using REAL reference audio"""
# Use Ben's reference audio as base (since it worked well)
ref_audio = REAL_BEN_REF
ref_text = REFERENCE_TEXTS["ben"]
output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
print(f"\n🎙️ Generating {accent_name} accent demo...")
print(f"Text: {text[:50]}...")
try:
# Generate audio with the SAME parameters as successful Ben voice cloning
audio = model.generate(
text=text,
prompt_wav_path=ref_audio,
prompt_text=ref_text,
cfg_value=2.0, # Same as successful Ben
inference_timesteps=20, # Same as successful Ben
normalize=True, # Enable text normalization
denoise=False, # Disable denoise
retry_badcase=True # Enable retry for bad cases
)
# Save audio
sf.write(output_file, audio, model.tts_model.sample_rate)
# Verify
if os.path.exists(output_file):
file_size = os.path.getsize(output_file)
duration = len(audio) / model.tts_model.sample_rate
print(f"✅ Generated successfully!")
print(f" File: {output_file}")
print(f" Size: {file_size} bytes")
print(f" Duration: {duration:.2f} seconds")
else:
print(f"❌ Failed to save")
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
def generate_cantonese_pinyin_demo(text, pinyin, output_dir=OUTPUT_DIR):
"""Generate Cantonese pinyin demo"""
output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
print(f"\n🎙️ Generating Cantonese pinyin demo...")
print(f"Text: {text[:50]}...")
try:
# Generate audio with the SAME parameters
audio = model.generate(
text=pinyin,
prompt_wav_path=REAL_BEN_REF, # Use Ben's reference
prompt_text=REFERENCE_TEXTS["ben"],
cfg_value=2.0,
inference_timesteps=20,
normalize=True,
denoise=False,
retry_badcase=True
)
# Save audio
sf.write(output_file, audio, model.tts_model.sample_rate)
# Verify
if os.path.exists(output_file):
file_size = os.path.getsize(output_file)
duration = len(audio) / model.tts_model.sample_rate
print(f"✅ Generated successfully!")
print(f" File: {output_file}")
print(f" Size: {file_size} bytes")
print(f" Duration: {duration:.2f} seconds")
else:
print(f"❌ Failed to save")
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
# Test sentence (same as before)
test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"
# Generate accent demos using REAL reference audio
accents = ["indian", "russian", "singaporean", "hongkong"]
for accent in accents:
generate_accent_demo_with_real_reference(test_text, accent)
# Generate Cantonese pinyin demo
cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
generate_cantonese_pinyin_demo(cantonese_text, cantonese_pinyin)
print(f"\n{'='*70}")
print(f"ACCENT DEMOS GENERATION COMPLETE")
print(f"{'='*70}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"\nAll demos generated with the SAME parameters that worked for Ben's voice!")

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Optimized accent demo generator using VoxCPM
Improved version with better parameters and shorter text
"""
import os
import numpy as np
import soundfile as sf
from voxcpm import VoxCPM
def generate_accent_demo(model, text, accent_name, output_dir="accent_demos_optimized"):
    """Generate optimized accent demo audio"""
    os.makedirs(output_dir, exist_ok=True)

    # Define reference audio paths
    ref_audio_map = {
        "indian": "reference_indian_opt.wav",
        "russian": "reference_russian_opt.wav",
        "singaporean": "reference_singaporean_opt.wav",
        "hongkong": "reference_hongkong_opt.wav"
    }

    # Define better reference texts (shorter, more natural)
    ref_text_map = {
        "indian": "Hello there! How are you today? I'm from India. The weather here is quite warm.",
        "russian": "Hello! How are you doing? I'm from Russia. The winters here are very cold.",
        "singaporean": "Hi! How's it going? I'm from Singapore. We have delicious hawker food here.",
        "hongkong": "Hey! How are you? I'm from Hong Kong. It's a bustling city with amazing food."
    }

    ref_audio = ref_audio_map.get(accent_name)
    ref_text = ref_text_map.get(accent_name)
    if not ref_audio or not ref_text:
        print(f"Invalid accent name: {accent_name}")
        return

    # Generate high-quality reference audio
    if not os.path.exists(ref_audio):
        print(f"Generating optimized reference audio for {accent_name}...")
        audio = model.generate(
            text=ref_text,
            cfg_value=3.0,          # Higher CFG for better quality
            inference_timesteps=30  # More steps for better quality
        )
        sf.write(ref_audio, audio, 24000)
        print(f"Generated optimized reference audio: {ref_audio}")

    # Generate accent demo with optimized parameters
    output_file = os.path.join(output_dir, f"{accent_name}_demo.wav")
    print(f"Generating optimized {accent_name} accent demo...")
    audio = model.generate(
        text=text,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=3.0,          # Higher CFG for better adherence to prompt
        inference_timesteps=30  # More steps for better quality
    )
    sf.write(output_file, audio, 24000)
    print(f"Generated optimized {accent_name} accent demo: {output_file}")
    return output_file


def generate_cantonese_pinyin_demo(model, text, pinyin, output_dir="accent_demos_optimized"):
    """Generate optimized Cantonese pinyin demo"""
    os.makedirs(output_dir, exist_ok=True)

    # Generate better Cantonese reference audio
    ref_audio = "reference_cantonese_opt.wav"
    ref_text = "你好,我是张学友。很高兴认识你。我喜欢唱歌。"
    if not os.path.exists(ref_audio):
        print("Generating optimized Cantonese reference audio...")
        audio = model.generate(
            text=ref_text,
            cfg_value=3.0,
            inference_timesteps=30
        )
        sf.write(ref_audio, audio, 24000)
        print(f"Generated optimized Cantonese reference audio: {ref_audio}")

    # Generate Cantonese pinyin demo
    output_file = os.path.join(output_dir, "cantonese_pinyin_demo.wav")
    print("Generating optimized Cantonese pinyin demo...")
    audio = model.generate(
        text=pinyin,
        prompt_wav_path=ref_audio,
        prompt_text=ref_text,
        cfg_value=3.0,
        inference_timesteps=30
    )
    sf.write(output_file, audio, 24000)
    print(f"Generated optimized Cantonese pinyin demo: {output_file}")
    return output_file


if __name__ == "__main__":
    # Initialize VoxCPM
    print("Initializing VoxCPM...")
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")

    # Shorter test text for better results
    test_text = "Hello everyone! Welcome to our podcast. I hope you enjoy this episode!"

    # Generate optimized accent demos
    accents = ["indian", "russian", "singaporean", "hongkong"]
    for accent in accents:
        generate_accent_demo(model, test_text, accent)

    # Generate optimized Cantonese pinyin demo
    cantonese_text = "张学友是香港著名歌手,被誉为歌神。"
    cantonese_pinyin = "Zhang Xueyou is a famous Hong Kong singer, known as the God of Songs."
    generate_cantonese_pinyin_demo(model, cantonese_text, cantonese_pinyin)

    print("All optimized demos generated successfully!")

View File

@@ -0,0 +1,88 @@
import os
import subprocess
import sys
def generate_host_b():
    """Generate Host B's speech using Fish Speech."""
    # Host B's lines, based on the earlier podcast content
    host_b_script = """
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.
Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.
The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.
"""

    # Save the lines to a temporary file
    script_file = "host_b_script.txt"
    with open(script_file, "w", encoding="utf-8") as f:
        f.write(host_b_script.strip())

    print("Generating Host B's speech with Fish Speech...")

    # Use the fish-speech-1.5 model
    print("Using the fish-speech-1.5 model...")
    server_cmd = [
        sys.executable, "fish-speech/tools/api_server.py",
        "--llama-checkpoint-path", "fish-speech/checkpoints/fish-speech-1.5/model.pth",
        "--decoder-checkpoint-path", "fish-speech/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
    ]
    server_process = subprocess.Popen(
        server_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        cwd="./"
    )

    # Wait for the server to start (give it enough time to load the model)
    import time
    print("Starting the server and loading the model...")
    for i in range(30):
        time.sleep(1)
        print(f"Starting... {i+1}/30s")

    # Send the synthesis request
    client_cmd = [
        sys.executable, "fish-speech/tools/api_client.py",
        "--text", host_b_script.strip(),
        "--reference_audio", "hosts/ben_guanquelou.wav",
        "--reference_text", "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。",
        "--output", "podcast_audios/host_b_ben",
        "--no-play",
        "--format", "mp3"
    ]
    print("Sending the synthesis request...")
    result = subprocess.run(client_cmd, capture_output=True, text=True, cwd="./")

    # Stop the server
    server_process.terminate()

    if result.returncode == 0:
        print("✅ Host B speech generated!")
        print("Output file: podcast_audios/host_b_ben.mp3")
        return True
    else:
        print("❌ Generation failed:")
        print(f"Error: {result.stderr}")
        print(f"Output: {result.stdout}")
        return False


if __name__ == "__main__":
    # Check that the model files exist
    model_path = "fish-speech/checkpoints/fish-speech-1.5/model.pth"
    decoder_path = "fish-speech/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
    if not os.path.exists(model_path):
        print("❌ Model file not found; please download the model first")
        print("Run: bash fish-speech/demo_download.sh")
        sys.exit(1)
    if not os.path.exists(decoder_path):
        print("❌ Decoder file not found; please download the model first")
        print("Run: bash fish-speech/demo_download.sh")
        sys.exit(1)
    generate_host_b()

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
MOSS-TTSD podcast generator - simplified version
Writes output directly to /root/tts/podcast_audios/
"""
import os
import subprocess
import sys

# Configuration
OUTPUT_DIR = "/root/tts/podcast_audios"
MODEL_DIR = "/root/tts/MOSS-TTSD"
def generate_podcast(script_file, output_name):
    """
    Generate a podcast and save it directly to podcast_audios.

    Args:
        script_file: path to the dialogue script (.txt, containing [S1] [S2] speaker tags)
        output_name: output file name (without the .wav suffix)
    """
print(f"🎙️ 生成播客: {output_name}")
print("=" * 50)
# 检查模型
if not os.path.exists(f"{MODEL_DIR}/MOSS-TTSD-v0.7"):
print("❌ MOSS-TTSD模型未下载")
return False
# 检查脚本文件
if not os.path.exists(script_file):
print(f"❌ 脚本文件不存在: {script_file}")
return False
# 创建临时JSONL文件
import json
import tempfile
# 读取脚本
with open(script_file, 'r', encoding='utf-8') as f:
script_text = f.read().strip()
# 创建对话数据
dialogue_data = {
"id": 1,
"base_path": "/root/tts/hosts",
"text": script_text,
"prompt_audio_speaker1": "ben_guanquelou.wav",
"prompt_text_speaker1": "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。",
"prompt_audio_speaker2": "judy_dalingtaohua_trim.wav",
"prompt_text_speaker2": "大林寺桃花,人间四月芳菲尽,山寺桃花始盛开。"
}
# 创建临时文件
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as f:
json.dump(dialogue_data, f, ensure_ascii=False)
f.write('\n')
temp_jsonl = f.name
print(f"✅ 脚本加载成功: {len(script_text)} 字符")
# 生成音频到临时位置
print("🎬 正在生成音频...")
cmd = [
sys.executable, f"{MODEL_DIR}/inference.py",
"--jsonl", temp_jsonl,
"--output_dir", "/tmp",
"--attn_implementation", "sdpa",
"--use_normalize",
"--silence_duration", "0.12",
"--seed", "42"
]
result = subprocess.run(cmd, capture_output=True, text=True)
# 删除临时JSONL文件
os.unlink(temp_jsonl)
if result.returncode != 0:
print("❌ 音频生成失败")
print(result.stderr)
return False
# 检查生成的音频
temp_audio = "/tmp/output_0.wav"
if not os.path.exists(temp_audio):
print("❌ 音频文件未生成")
return False
# 复制到目标位置
output_path = f"{OUTPUT_DIR}/{output_name}.wav"
subprocess.run(["cp", temp_audio, output_path], check=True)
os.unlink(temp_audio)
# 获取音频信息
probe_result = subprocess.run(
["ffprobe", output_path, "-v", "quiet", "-show_streams"],
capture_output=True, text=True
)
duration = "未知"
if probe_result.returncode == 0:
for line in probe_result.stdout.split('\n'):
if line.startswith("duration="):
duration = f"{float(line.split('=')[1]):.1f}"
break
file_size = os.path.getsize(output_path) / (1024 * 1024)
print(f"✅ 生成成功!")
print(f"📁 文件位置: {output_path}")
print(f"📊 文件大小: {file_size:.1f}MB")
print(f"⏱️ 音频时长: {duration}")
print()
print("🎧 播放命令:")
print(f" ffplay {output_path}")
print(f" # 或")
print(f" aplay {output_path}")
return True
def main():
if len(sys.argv) != 3:
print("用法:")
print(f" {sys.argv[0]} <脚本文件> <输出名称>")
print()
print("示例:")
print(f" {sys.argv[0]} chapter8_script.txt chapter8_demo")
print()
print("脚本文件格式: 纯文本,包含[S1] [S2]标签")
print("输出名称: 不需要加.wav后缀")
sys.exit(1)
script_file = sys.argv[1]
output_name = sys.argv[2]
generate_podcast(script_file, output_name)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,297 @@
import os
import subprocess
from pydub import AudioSegment
from pydub.generators import WhiteNoise
import random
# Make sure the output directory exists
output_dir = "podcast_audios"
os.makedirs(output_dir, exist_ok=True)


def apply_phone_effect(audio_segment, noise_level=0.02, add_dial_tone=False):
    """
    Apply an "overseas phone call" effect:
    - simulate the telephone bandwidth limit (300-3400 Hz)
    - add line noise
    - slight distortion
    - optionally prepend a dial tone
    """
    # 0. Optional: prepend a dial tone and a connect beep
    if add_dial_tone:
        # Generate the dial tone (international long-distance style)
        dial_tone = generate_dial_tone(duration=2000)
        # Generate the connect beep (a short prompt tone)
        connect_beep = generate_connect_beep()
        # Prepend the dial tone and beep to the audio
        audio_segment = dial_tone + connect_beep + audio_segment

    # 1. Lower the sample rate to mimic telephone quality
    audio_segment = audio_segment.set_frame_rate(8000)

    # 2. Apply a band-pass filter to mimic the telephone band
    #    (pydub has no direct band-pass filter, so combine low-pass + high-pass)
    audio_segment = audio_segment.low_pass_filter(3400)
    audio_segment = audio_segment.high_pass_filter(300)

    # 3. Generate line noise (hiss and crackle)
    #    Create white noise and shape its spectrum so it sounds like phone-line noise
    noise = WhiteNoise().to_audio_segment(duration=len(audio_segment))
    noise = noise.low_pass_filter(2000)  # limit the noise high frequencies
    noise = noise - (60 / noise_level)   # adjust the noise volume

    # 4. Add intermittent electrical interference
    crackle_interval = 3000  # one burst every 3 seconds
    crackle_duration = 200   # each burst lasts 200 ms
    for i in range(0, len(audio_segment), crackle_interval):
        if random.random() < 0.3:  # 30% chance of a burst
            # Generate a short burst of interference noise
            crackle = WhiteNoise().to_audio_segment(duration=crackle_duration)
            crackle = crackle.low_pass_filter(1000)
            crackle = crackle - 30  # relatively loud
            # Overlay the burst at this position
            position = i
            if position + crackle_duration < len(audio_segment):
                audio_segment = audio_segment.overlay(crackle, position=position)

    # 5. Overlay the background noise
    audio_segment = audio_segment.overlay(noise)

    # 6. Lightly compress the dynamic range to mimic phone-line limits
    audio_segment = audio_segment.compress_dynamic_range(threshold=-20.0, ratio=4.0)
    return audio_segment


def generate_dial_tone(duration=2000):
    """Generate an international long-distance dial tone."""
    # Dual-frequency dial tone (440 Hz + 350 Hz)
    from pydub.generators import Sine
    tone1 = Sine(440).to_audio_segment(duration=duration)
    tone2 = Sine(350).to_audio_segment(duration=duration)
    dial_tone = tone1.overlay(tone2)
    dial_tone = dial_tone - 25  # lower the volume
    return dial_tone


def generate_connect_beep(duration=500):
    """Generate a connect beep."""
    from pydub.generators import Sine
    # 1000 Hz prompt tone
    beep = Sine(1000).to_audio_segment(duration=duration)
    beep = beep - 20
    return beep
# Dialogue content (in English, based on the paper, with multiple roles)
dialogue = [
# Host 1 (Male, American) - Alex
{
"text": "Welcome to Geopolitics Unpacked. I'm Alex.",
"voice": "en-US-BrianNeural",
"file": "host1_alex_opening.mp3"
},
# Host 2 (Female, American) - Sarah
{
"text": "And I'm Sarah. Today we're discussing Ben Xu's paper 'A Tale of 2 Treaties' and exploring the geopolitical dynamics of the Cold War era.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_opening.mp3"
},
# Host 1 - Alex
{
"text": "Sarah, the paper introduces this fascinating concept of '轮庄博弈' (turn-based power game) to explain historical cycles. How does this apply to the rise and fall of the Warsaw Pact and NATO?",
"voice": "en-US-BrianNeural",
"file": "host1_alex_question.mp3"
},
# Host 2 - Sarah
{
"text": "It's brilliant. The paper argues that just like in a mahjong game, the '庄家' (庄家) tries to maintain power by exploiting the '闲家' (闲家), but eventually gets overthrown by a coalition of the exploited. Applied to the Cold War, this explains how the Soviet Union's attempts to maintain control over its satellite states led to the collapse of the Warsaw Pact.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_response.mp3"
},
# Guest 1 (Male, Russian accent) - Dmitri
{
"text": "Hello, this is Dmitri calling from Moscow. I found the paper's analysis of the Soviet Union's collapse particularly insightful. The author mentions how the Soviet Union's focus on military power at the expense of technological innovation led to its decline. Do you think this is still relevant today?",
"voice": "ru-RU-DmitryNeural",
"file": "guest1_dmitri_callin.mp3"
},
# Host 1 - Alex
{
"text": "Great question, Dmitri. The paper does highlight how the Soviet Union's decision to abandon the Setun ternary computer in favor of copying IBM's binary systems was a critical mistake. This technological stagnation, combined with the arms race,耗尽了 the Soviet economy. What do you think, Sarah?",
"voice": "en-US-BrianNeural",
"file": "host1_alex_response_to_dmitri.mp3"
},
# Host 2 - Sarah
{
"text": "Absolutely, Dmitri. The paper's analysis of the '赛博共产主义' (cyber communism) vision that never materialized is fascinating. The Soviet Union had the technical expertise to develop advanced computing systems, but bureaucratic interests and a focus on military might derailed those efforts. This is a cautionary tale for any nation that prioritizes military power over technological innovation.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_response_to_dmitri.mp3"
},
# Guest 1 - Dmitri
{
"text": "Thank you. It's interesting to see how the paper connects these historical lessons to contemporary geopolitics. The rise of China as a technological power while maintaining a strong military presence shows that a balance is possible.",
"voice": "ru-RU-DmitryNeural",
"file": "guest1_dmitri_conclusion.mp3"
},
# Host 2 - Sarah
{
"text": "That's a great point, Dmitri. Thank you for calling in.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_thanks_dmitri.mp3"
},
# Guest 2 (Female, Indian accent) - Priya
{
"text": "Hi, this is Priya from New Delhi. I was intrigued by the paper's section on '革命输出的会计困局' (the accounting dilemma of revolution export). The author argues that China's foreign aid policies during the Cold War suffered from conflicting objectives. Could you elaborate on this?",
"voice": "en-IN-NeerjaExpressiveNeural",
"file": "guest2_priya_callin.mp3"
},
# Host 1 - Alex
{
"text": "Thanks for calling, Priya. The paper uses an accounting metaphor to explain the problem. Traditional tributary systems had clear objectives (maintaining political order), but revolutionary export tried to achieve both political returns and selfless aid simultaneously, leading to confusion and inefficiency. Sarah, could you expand on this?",
"voice": "en-US-BrianNeural",
"file": "host1_alex_response_to_priya.mp3"
},
# Host 2 - Sarah
{
"text": "Definitely, Priya. The paper argues that this accounting dilemma led to situations where China provided significant aid to countries like Albania and Vietnam without clear strategic returns. When these relationships soured, it created diplomatic challenges. The author suggests that this experience influenced China's more pragmatic foreign aid policies today, which are more focused on mutual benefit through economic cooperation.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_response_to_priya.mp3"
},
# Guest 2 - Priya
{
"text": "Fascinating. This perspective helps explain the evolution of China's foreign policy from the Cold War era to today's Belt and Road Initiative. Thank you for the insight.",
"voice": "en-IN-NeerjaExpressiveNeural",
"file": "guest2_priya_conclusion.mp3"
},
# Host 1 - Alex
{
"text": "Thank you, Priya. It's been great having both of you on the show today.",
"voice": "en-US-BrianNeural",
"file": "host1_alex_final_thanks.mp3"
},
# Host 2 - Sarah
{
"text": "Join us next time as we continue exploring the insights from Ben Xu's 'A Tale of 2 Treaties' and their relevance to contemporary geopolitics. Until then, this is Geopolitics Unpacked signing off.",
"voice": "en-US-AriaNeural",
"file": "host2_sarah_final.mp3"
}
]
# Generate each role's audio segment and the matching SRT subtitles
print("Generating audio segments and subtitles...")
for item in dialogue:
    file_path = os.path.join(output_dir, item["file"])
    srt_path = os.path.join(output_dir, os.path.splitext(item["file"])[0] + ".srt")
    cmd = [
        "edge-tts",
        "--voice", item["voice"],
        "--text", item["text"],
        "--write-media", file_path,
        "--write-subtitles", srt_path
    ]
    subprocess.run(cmd, check=True)
    print(f"Generated: {item['file']} and {os.path.basename(srt_path)}")
# Concatenate the audio segments
print("\nConcatenating audio segments...")
combined = AudioSegment.empty()
for item in dialogue:
    file_path = os.path.join(output_dir, item["file"])
    audio = AudioSegment.from_mp3(file_path)
    # Check whether this is a call-in guest (file name contains 'callin')
    if 'callin' in item["file"].lower():
        print(f" Applying phone effect to: {item['file']}")
        audio = apply_phone_effect(audio, add_dial_tone=True)  # prepend the dial tone
        # Save the processed version
        phone_file_path = os.path.join(output_dir, item["file"].replace('.mp3', '_phone.mp3'))
        audio.export(phone_file_path, format="mp3")
    combined += audio

# Write out the complete podcast file
output_file = os.path.join(output_dir, "multi_guest_callin_podcast.mp3")
combined.export(output_file, format="mp3")
print(f"\nComplete podcast saved to: {output_file}")
# Merge the SRT subtitle files
print("\nMerging subtitle files...")


def parse_srt_time(time_str):
    """Parse an SRT timestamp into milliseconds."""
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)


def format_srt_time(ms):
    """Format milliseconds as an SRT timestamp."""
    h = ms // 3600000
    ms %= 3600000
    m = ms // 60000
    ms %= 60000
    s = ms // 1000
    ms %= 1000
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
merged_subtitles = []
current_time = 0  # cumulative time offset in milliseconds
subtitle_index = 1

for item in dialogue:
    srt_path = os.path.join(output_dir, os.path.splitext(item["file"])[0] + ".srt")

    # Read the SRT file
    with open(srt_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Parse the subtitle entries
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.isdigit():
            # Subtitle index
            i += 1
            # Timing line
            time_line = lines[i].strip()
            start_time_str, end_time_str = time_line.split(' --> ')
            start_time = parse_srt_time(start_time_str)
            end_time = parse_srt_time(end_time_str)
            i += 1
            # Subtitle text
            text_lines = []
            while i < len(lines) and lines[i].strip():
                text_lines.append(lines[i].strip())
                i += 1
            text = '\n'.join(text_lines)
            # Shift the timestamps by the cumulative offset
            adjusted_start = current_time + start_time
            adjusted_end = current_time + end_time
            # Add to the merged list
            merged_subtitles.append({
                'index': subtitle_index,
                'start': adjusted_start,
                'end': adjusted_end,
                'text': text
            })
            subtitle_index += 1
        i += 1

    # Update the cumulative time offset
    file_path = os.path.join(output_dir, item["file"])
    # If the file was processed, use the processed version to compute the duration
    phone_file_path = os.path.join(output_dir, item["file"].replace('.mp3', '_phone.mp3'))
    if os.path.exists(phone_file_path):
        audio = AudioSegment.from_mp3(phone_file_path)
    else:
        audio = AudioSegment.from_mp3(file_path)
    current_time += len(audio)  # len(audio) is in milliseconds

# Write the merged SRT file
output_srt = os.path.join(output_dir, "multi_guest_callin_podcast.srt")
with open(output_srt, 'w', encoding='utf-8') as f:
    for sub in merged_subtitles:
        f.write(f"{sub['index']}\n")
        f.write(f"{format_srt_time(sub['start'])} --> {format_srt_time(sub['end'])}\n")
        f.write(f"{sub['text']}\n\n")

print(f"\nComplete subtitle file saved to: {output_srt}")
print("\nPodcast generation completed successfully!")

View File

@@ -0,0 +1,18 @@
# F5-TTS configuration for Host B (Ben)
model = "F5TTS_v1_Base"
[reference]
audio = "../hosts/ben_guanquelou.wav"
text = "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。"
[generation]
text = """
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.
Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.
The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.
"""
[output]
path = "../podcast_audios/host_b_ben_f5.mp3"

View File

@@ -0,0 +1,5 @@
Sarah, the paper's analysis of the Soviet Union's collapse is really thought-provoking. The author's concept of '轮庄博弈' (turn-based power game) perfectly explains why the Warsaw Pact eventually dissolved. It's fascinating how the paper connects historical patterns to modern geopolitics.
Regarding the 'accounting dilemma of revolution export' that Priya mentioned, I think the paper makes a crucial point. China's foreign aid policies during the Cold War struggled because they tried to balance political objectives with genuine humanitarian assistance. This tension is something we still see in international relations today.
The paper's discussion of technological innovation versus military spending is particularly relevant. The Soviet Union's decision to prioritize military power over technological development ultimately led to its decline. This is a lesson that all nations should heed in the modern era of rapid technological change.