Files
tts/scripts/generation/generate_moss_ttsd_podcast.py
2026-01-19 10:27:41 +08:00

143 lines
4.1 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
MOSS-TTSD 播客生成器 - 简化版
直接生成到 /root/tts/podcast_audios/
"""
import os
import subprocess
import sys
# Configuration
# Directory that receives the finished podcast file (<output_name>.wav).
OUTPUT_DIR = "/root/tts/podcast_audios"
# Root of the MOSS-TTSD checkout: generate_podcast runs inference.py from
# here and requires the MOSS-TTSD-v0.7 folder to exist inside it.
MODEL_DIR = "/root/tts/MOSS-TTSD"
def generate_podcast(script_file, output_name):
    """Generate a podcast WAV from a dialogue script and save it to OUTPUT_DIR.

    The script text is wrapped in a one-record JSONL job, MOSS-TTSD's
    ``inference.py`` is run on it in a subprocess, and the resulting audio is
    moved to ``OUTPUT_DIR/<output_name>.wav``. Progress and errors are
    reported on stdout.

    Args:
        script_file: Path to the dialogue script (plain text containing
            ``[S1]`` / ``[S2]`` speaker tags).
        output_name: Output file name without the ``.wav`` suffix.

    Returns:
        True if the audio file was generated and stored, False otherwise.
    """
    # Function-scope imports, as in the original: only this function needs them.
    import json
    import shutil
    import tempfile

    print(f"🎙️ 生成播客: {output_name}")
    print("=" * 50)

    # The model weights must already be present.
    if not os.path.exists(f"{MODEL_DIR}/MOSS-TTSD-v0.7"):
        print("❌ MOSS-TTSD模型未下载")
        return False

    # The dialogue script must exist.
    if not os.path.exists(script_file):
        print(f"❌ 脚本文件不存在: {script_file}")
        return False

    with open(script_file, 'r', encoding='utf-8') as f:
        script_text = f.read().strip()

    # Do not start a long inference run for an empty script.
    if not script_text:
        print(f"❌ 脚本文件为空: {script_file}")
        return False

    # Job record consumed by inference.py. The prompt audio files are
    # presumably resolved relative to base_path -- confirm against inference.py.
    dialogue_data = {
        "id": 1,
        "base_path": "/root/tts/hosts",
        "text": script_text,
        "prompt_audio_speaker1": "ben_guanquelou.wav",
        "prompt_text_speaker1": "白日依山尽,黄河入海流,欲穷千里目,更上一层楼。",
        "prompt_audio_speaker2": "judy_dalingtaohua_trim.wav",
        "prompt_text_speaker2": "大林寺桃花,人间四月芳菲尽,山寺桃花始盛开。"
    }

    # A private scratch directory holds both the JSONL job and the raw audio.
    # It is removed on every exit path (including exceptions), and a stale or
    # concurrent /tmp/output_0.wav can no longer be mistaken for this run's
    # result.
    with tempfile.TemporaryDirectory(prefix="moss_ttsd_") as work_dir:
        jsonl_path = os.path.join(work_dir, "dialogue.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            json.dump(dialogue_data, f, ensure_ascii=False)
            f.write('\n')

        print(f"✅ 脚本加载成功: {len(script_text)} 字符")

        print("🎬 正在生成音频...")
        cmd = [
            sys.executable, f"{MODEL_DIR}/inference.py",
            "--jsonl", jsonl_path,
            "--output_dir", work_dir,
            "--attn_implementation", "sdpa",
            "--use_normalize",
            "--silence_duration", "0.12",
            "--seed", "42"
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print("❌ 音频生成失败")
            print(result.stderr)
            return False

        # This script expects inference.py to name the first record's
        # audio output_0.wav.
        temp_audio = os.path.join(work_dir, "output_0.wav")
        if not os.path.exists(temp_audio):
            print("❌ 音频文件未生成")
            return False

        # Move the result to its final location, creating the target
        # directory on first use instead of failing.
        output_path = f"{OUTPUT_DIR}/{output_name}.wav"
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            shutil.move(temp_audio, output_path)
        except OSError as exc:
            print(f"❌ 无法保存音频文件: {exc}")
            return False

    # Duration is informational only: a missing ffprobe binary or an
    # unparsable value must not turn a successful generation into a crash.
    duration = "未知"
    try:
        probe_result = subprocess.run(
            ["ffprobe", output_path, "-v", "quiet", "-show_streams"],
            capture_output=True, text=True
        )
    except OSError:
        probe_result = None
    if probe_result is not None and probe_result.returncode == 0:
        for line in probe_result.stdout.split('\n'):
            if line.startswith("duration="):
                try:
                    duration = f"{float(line.split('=')[1]):.1f}"
                except ValueError:
                    # e.g. "duration=N/A"
                    pass
                break

    file_size = os.path.getsize(output_path) / (1024 * 1024)
    print("✅ 生成成功!")
    print(f"📁 文件位置: {output_path}")
    print(f"📊 文件大小: {file_size:.1f}MB")
    print(f"⏱️ 音频时长: {duration}")
    print()
    print("🎧 播放命令:")
    print(f" ffplay {output_path}")
    print(" # 或")
    print(f" aplay {output_path}")
    return True
def main():
    """Command-line entry point: ``<prog> <script_file> <output_name>``.

    Prints usage and exits with status 1 when the argument count is wrong.
    Otherwise runs generate_podcast and exits with status 1 if it reports
    failure, so shell pipelines and schedulers can detect a failed run.
    Returns normally (exit status 0) on success.
    """
    if len(sys.argv) != 3:
        print("用法:")
        print(f" {sys.argv[0]} <脚本文件> <输出名称>")
        print()
        print("示例:")
        print(f" {sys.argv[0]} chapter8_script.txt chapter8_demo")
        print()
        print("脚本文件格式: 纯文本,包含[S1] [S2]标签")
        print("输出名称: 不需要加.wav后缀")
        sys.exit(1)

    script_file = sys.argv[1]
    output_name = sys.argv[2]

    # generate_podcast returns False on any failure; surface that as a
    # non-zero exit status instead of silently exiting 0.
    if not generate_podcast(script_file, output_name):
        sys.exit(1)


if __name__ == "__main__":
    main()